From 7891a73ef36f4ad7b71069b3c57694f85bb79454 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Fri, 20 Aug 2021 21:39:25 +0100
Subject: Move CPU/GPU files from Core/Runtime to the respective backend folders

The legacy structure contained two libraries (core/runtime) with two backends in each. We reduce the core/runtime libraries to a single library, thus merging the backend files.

Signed-off-by: Georgios Pinitas
Change-Id: I69545765fe7a730368105cdbd067d3135ec7a174
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6155
Comments-Addressed: Arm Jenkins
Reviewed-by: Michele Di Giorgio
Tested-by: Arm Jenkins
---
 src/core/CL/CLHelpers.cpp | 4 +-
 src/core/CL/CLKernelLibrary.cpp | 2 +-
 .../CLDepthwiseConvolutionLayerNativeKernel.cpp | 2 +-
 src/core/cpu/ICpuKernel.h | 36 -
 src/core/cpu/kernels/CpuActivationKernel.cpp | 260 ---
 src/core/cpu/kernels/CpuActivationKernel.h | 75 -
 src/core/cpu/kernels/CpuAddKernel.cpp | 296 ----
 src/core/cpu/kernels/CpuAddKernel.h | 84 -
 src/core/cpu/kernels/CpuCastKernel.cpp | 1367 ----------------
 src/core/cpu/kernels/CpuCastKernel.h | 82 -
 src/core/cpu/kernels/CpuCol2ImKernel.cpp | 124 --
 src/core/cpu/kernels/CpuCol2ImKernel.h | 87 -
 src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp | 211 ---
 src/core/cpu/kernels/CpuConcatenateBatchKernel.h | 73 -
 src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp | 207 ---
 src/core/cpu/kernels/CpuConcatenateDepthKernel.h | 81 -
 .../cpu/kernels/CpuConcatenateHeightKernel.cpp | 178 --
 src/core/cpu/kernels/CpuConcatenateHeightKernel.h | 70 -
 src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp | 175 --
 src/core/cpu/kernels/CpuConcatenateWidthKernel.h | 70 -
 .../CpuConvertFullyConnectedWeightsKernel.cpp | 113 --
 .../CpuConvertFullyConnectedWeightsKernel.h | 76 -
 .../CpuConvertQuantizedSignednessKernel.cpp | 142 --
 .../kernels/CpuConvertQuantizedSignednessKernel.h | 63 -
 src/core/cpu/kernels/CpuCopyKernel.cpp | 166 --
 src/core/cpu/kernels/CpuCopyKernel.h | 67 -
 .../cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp | 950 -----------
 .../cpu/kernels/CpuDepthwiseConv2dNativeKernel.h | 105 --
 src/core/cpu/kernels/CpuDequantizeKernel.cpp | 400 -----
 src/core/cpu/kernels/CpuDequantizeKernel.h | 63 -
 src/core/cpu/kernels/CpuDirectConv2dKernel.cpp | 1385 ----------------
 src/core/cpu/kernels/CpuDirectConv2dKernel.h | 91 --
 .../kernels/CpuDirectConv2dOutputStageKernel.cpp | 513 ------
 .../cpu/kernels/CpuDirectConv2dOutputStageKernel.h | 85 -
 src/core/cpu/kernels/CpuElementwiseKernel.cpp | 454 -----
 src/core/cpu/kernels/CpuElementwiseKernel.h | 222 ---
 src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp | 182 ---
 src/core/cpu/kernels/CpuElementwiseUnaryKernel.h | 81 -
 src/core/cpu/kernels/CpuFillKernel.cpp | 90 -
 src/core/cpu/kernels/CpuFillKernel.h | 60 -
 src/core/cpu/kernels/CpuFloorKernel.cpp | 177 --
 src/core/cpu/kernels/CpuFloorKernel.h | 78 -
 .../cpu/kernels/CpuGemmInterleave4x4Kernel.cpp | 151 --
 src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h | 80 -
 .../kernels/CpuGemmLowpMatrixMultiplyKernel.cpp | 1053 ------------
 .../cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h | 80 -
 .../kernels/CpuGemmLowpMatrixReductionKernel.cpp | 396 -----
 .../cpu/kernels/CpuGemmLowpMatrixReductionKernel.h | 157 --
 .../CpuGemmLowpOffsetContributionKernel.cpp | 417 -----
 .../kernels/CpuGemmLowpOffsetContributionKernel.h | 88 -
 ...GemmLowpOffsetContributionOutputStageKernel.cpp | 946 -----------
 ...puGemmLowpOffsetContributionOutputStageKernel.h | 114 --
 .../CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp | 326 ----
 .../CpuGemmLowpQuantizeDownInt32ScaleKernel.h
| 107 -- ...tizeDownInt32ToInt16ScaleByFixedPointKernel.cpp | 227 --- ...antizeDownInt32ToInt16ScaleByFixedPointKernel.h | 111 -- ...ntizeDownInt32ToInt8ScaleByFixedPointKernel.cpp | 239 --- ...uantizeDownInt32ToInt8ScaleByFixedPointKernel.h | 114 -- ...tizeDownInt32ToUint8ScaleByFixedPointKernel.cpp | 236 --- ...antizeDownInt32ToUint8ScaleByFixedPointKernel.h | 108 -- .../cpu/kernels/CpuGemmMatrixAdditionKernel.cpp | 200 --- src/core/cpu/kernels/CpuGemmMatrixAdditionKernel.h | 88 - .../cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp | 1174 ------------- src/core/cpu/kernels/CpuGemmMatrixMultiplyKernel.h | 91 -- src/core/cpu/kernels/CpuGemmTranspose1xWKernel.cpp | 137 -- src/core/cpu/kernels/CpuGemmTranspose1xWKernel.h | 97 -- src/core/cpu/kernels/CpuIm2ColKernel.cpp | 448 ----- src/core/cpu/kernels/CpuIm2ColKernel.h | 123 -- src/core/cpu/kernels/CpuMulKernel.cpp | 1729 -------------------- src/core/cpu/kernels/CpuMulKernel.h | 148 -- src/core/cpu/kernels/CpuPermuteKernel.cpp | 301 ---- src/core/cpu/kernels/CpuPermuteKernel.h | 69 - src/core/cpu/kernels/CpuPool2dKernel.cpp | 516 ------ src/core/cpu/kernels/CpuPool2dKernel.h | 82 - src/core/cpu/kernels/CpuQuantizeKernel.cpp | 266 --- src/core/cpu/kernels/CpuQuantizeKernel.h | 89 - src/core/cpu/kernels/CpuReshapeKernel.cpp | 140 -- src/core/cpu/kernels/CpuReshapeKernel.h | 64 - src/core/cpu/kernels/CpuScaleKernel.cpp | 623 ------- src/core/cpu/kernels/CpuScaleKernel.h | 108 -- src/core/cpu/kernels/CpuSoftmaxKernel.cpp | 378 ----- src/core/cpu/kernels/CpuSoftmaxKernel.h | 111 -- src/core/cpu/kernels/CpuSubKernel.cpp | 201 --- src/core/cpu/kernels/CpuSubKernel.h | 84 - src/core/cpu/kernels/CpuTransposeKernel.cpp | 510 ------ src/core/cpu/kernels/CpuTransposeKernel.h | 63 - src/core/cpu/kernels/CpuWeightsReshapeKernel.cpp | 170 -- src/core/cpu/kernels/CpuWeightsReshapeKernel.h | 91 -- src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp | 551 ------- src/core/cpu/kernels/CpuWinogradConv2dKernel.h | 575 ------- src/core/cpu/kernels/activation/list.h | 49 - src/core/cpu/kernels/activation/neon/fp16.cpp | 217 --- src/core/cpu/kernels/activation/neon/fp32.cpp | 212 --- src/core/cpu/kernels/activation/neon/qasymm8.cpp | 262 --- .../cpu/kernels/activation/neon/qasymm8_signed.cpp | 261 --- src/core/cpu/kernels/activation/neon/qsymm16.cpp | 138 -- src/core/cpu/kernels/activation/sve/fp16.cpp | 130 -- src/core/cpu/kernels/activation/sve/fp32.cpp | 131 -- src/core/cpu/kernels/activation/sve/qasymm8.cpp | 253 --- .../cpu/kernels/activation/sve/qasymm8_signed.cpp | 253 --- src/core/cpu/kernels/activation/sve/qsymm16.cpp | 120 -- src/core/cpu/kernels/add/neon/list.h | 143 -- src/core/cpu/kernels/add/neon/qasymm8.cpp | 209 --- src/core/cpu/kernels/add/neon/qasymm8_signed.cpp | 208 --- src/core/cpu/kernels/add/neon/qsymm16.cpp | 174 -- src/core/cpu/kernels/add/sve/impl.cpp | 139 -- src/core/cpu/kernels/add/sve/impl.h | 40 - src/core/cpu/kernels/add/sve/list.h | 51 - src/core/cpu/kernels/add/sve/qasymm8.cpp | 182 --- src/core/cpu/kernels/add/sve/qasymm8_signed.cpp | 181 -- src/core/cpu/kernels/add/sve/qsymm16.cpp | 156 -- .../assembly/CpuGemmAssemblyWrapperKernel.h | 126 -- src/core/cpu/kernels/assembly/arm_gemm.hpp | 190 --- .../kernels/assembly/arm_gemm_compute_iface.hpp | 130 -- src/core/cpu/kernels/assembly/arm_gemm_local.hpp | 31 - .../kernels/assembly/convolution_parameters.hpp | 65 - src/core/cpu/kernels/assembly/gemm_common.hpp | 236 --- src/core/cpu/kernels/assembly/ndrange.hpp | 199 --- .../kernels/elementwise/neon/elementwise_list.h | 486 ------ 
.../elementwise/neon/elementwise_quantized_list.h | 654 -------- .../elementwise/neon/elementwise_unary_list.h | 116 -- .../cpu/kernels/elementwise/sve/elementwise.cpp | 311 ---- .../cpu/kernels/elementwise/sve/elementwise_list.h | 171 -- .../elementwise/sve/elementwise_quantized_list.h | 366 ----- .../kernels/elementwise/sve/elementwise_unary.cpp | 113 -- .../elementwise/sve/elementwise_unary_list.h | 39 - src/core/cpu/kernels/floor/list.h | 41 - src/core/cpu/kernels/floor/neon/fp16.cpp | 64 - src/core/cpu/kernels/floor/neon/fp32.cpp | 61 - .../CpuDepthwiseConv2dAssemblyWrapperKernel.cpp | 359 ---- .../CpuDepthwiseConv2dAssemblyWrapperKernel.h | 120 -- .../internal/CpuPool2dAssemblyWrapperKernel.cpp | 279 ---- .../internal/CpuPool2dAssemblyWrapperKernel.h | 119 -- src/core/cpu/kernels/pool2d/neon/fp16.cpp | 317 ---- src/core/cpu/kernels/pool2d/neon/fp32.cpp | 314 ---- src/core/cpu/kernels/pool2d/neon/list.h | 97 -- src/core/cpu/kernels/pool2d/neon/nchw/all.cpp | 700 -------- src/core/cpu/kernels/pool2d/neon/qasymm8.cpp | 41 - .../cpu/kernels/pool2d/neon/qasymm8_signed.cpp | 41 - src/core/cpu/kernels/pool2d/neon/quantized.h | 863 ---------- src/core/cpu/kernels/scale/neon/fp16.cpp | 174 -- src/core/cpu/kernels/scale/neon/integer.cpp | 293 ---- src/core/cpu/kernels/scale/neon/list.h | 185 --- src/core/cpu/kernels/scale/neon/qasymm8.cpp | 145 -- src/core/cpu/kernels/scale/neon/qasymm8_signed.cpp | 145 -- src/core/cpu/kernels/scale/sve/fp16.cpp | 176 -- src/core/cpu/kernels/scale/sve/fp32.cpp | 174 -- src/core/cpu/kernels/scale/sve/integer.cpp | 300 ---- src/core/cpu/kernels/scale/sve/list.h | 47 - src/core/cpu/kernels/scale/sve/qasymm8.cpp | 207 --- src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp | 207 --- src/core/cpu/kernels/softmax/impl/neon/list.h | 388 ----- src/core/cpu/kernels/softmax/impl/sve/impl.cpp | 185 --- src/core/cpu/kernels/softmax/impl/sve/list.h | 223 --- src/core/cpu/kernels/sub/neon/list.h | 159 -- src/core/cpu/kernels/sub/neon/qasymm8.cpp | 230 --- src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp | 229 --- src/core/cpu/kernels/sub/neon/qsymm16.cpp | 201 --- src/core/gpu/cl/ClCompileContext.h | 36 - src/core/gpu/cl/ClKernelLibrary.cpp | 1029 ------------ src/core/gpu/cl/ClKernelLibrary.h | 95 -- src/core/gpu/cl/IClKernel.h | 37 - src/core/gpu/cl/kernels/ClActivationKernel.cpp | 255 --- src/core/gpu/cl/kernels/ClActivationKernel.h | 71 - .../gpu/cl/kernels/ClBatchConcatenateKernel.cpp | 153 -- src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h | 74 - src/core/gpu/cl/kernels/ClCastKernel.cpp | 168 -- src/core/gpu/cl/kernels/ClCastKernel.h | 79 - src/core/gpu/cl/kernels/ClCol2ImKernel.cpp | 175 -- src/core/gpu/cl/kernels/ClCol2ImKernel.h | 89 - .../ClConvertFullyConnectedWeightsKernel.cpp | 124 -- .../kernels/ClConvertFullyConnectedWeightsKernel.h | 73 - src/core/gpu/cl/kernels/ClCopyKernel.cpp | 175 -- src/core/gpu/cl/kernels/ClCopyKernel.h | 69 - src/core/gpu/cl/kernels/ClCropKernel.cpp | 136 -- src/core/gpu/cl/kernels/ClCropKernel.h | 78 - .../gpu/cl/kernels/ClDepthConcatenateKernel.cpp | 139 -- src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h | 74 - src/core/gpu/cl/kernels/ClDequantizeKernel.cpp | 158 -- src/core/gpu/cl/kernels/ClDequantizeKernel.h | 64 - src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp | 672 -------- src/core/gpu/cl/kernels/ClDirectConv2dKernel.h | 89 - src/core/gpu/cl/kernels/ClElementwiseKernel.cpp | 525 ------ src/core/gpu/cl/kernels/ClElementwiseKernel.h | 200 --- .../gpu/cl/kernels/ClElementwiseUnaryKernel.cpp | 168 -- 
src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h | 65 - src/core/gpu/cl/kernels/ClFillKernel.cpp | 120 -- src/core/gpu/cl/kernels/ClFillKernel.h | 68 - src/core/gpu/cl/kernels/ClFloorKernel.cpp | 124 -- src/core/gpu/cl/kernels/ClFloorKernel.h | 64 - .../ClGemmLowpMatrixMultiplyNativeKernel.cpp | 335 ---- .../kernels/ClGemmLowpMatrixMultiplyNativeKernel.h | 81 - .../ClGemmLowpMatrixMultiplyReshapedKernel.cpp | 300 ---- .../ClGemmLowpMatrixMultiplyReshapedKernel.h | 90 - ...GemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp | 544 ------ ...ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h | 100 -- .../kernels/ClGemmLowpOffsetContributionKernel.cpp | 212 --- .../kernels/ClGemmLowpOffsetContributionKernel.h | 86 - ...GemmLowpOffsetContributionOutputStageKernel.cpp | 263 --- ...ClGemmLowpOffsetContributionOutputStageKernel.h | 90 - ...owpQuantizeDownInt32ScaleByFixedPointKernel.cpp | 160 -- ...mLowpQuantizeDownInt32ScaleByFixedPointKernel.h | 78 - ...GemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp | 160 -- ...ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h | 80 - .../ClGemmLowpQuantizeDownInt32ScaleKernel.cpp | 157 -- .../ClGemmLowpQuantizeDownInt32ScaleKernel.h | 80 - .../gpu/cl/kernels/ClGemmLowpReductionKernel.cpp | 219 --- .../gpu/cl/kernels/ClGemmLowpReductionKernel.h | 124 -- .../gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp | 538 ------ .../gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h | 88 - .../kernels/ClGemmMatrixMultiplyNativeKernel.cpp | 416 ----- .../cl/kernels/ClGemmMatrixMultiplyNativeKernel.h | 88 - .../kernels/ClGemmMatrixMultiplyReshapedKernel.cpp | 421 ----- .../kernels/ClGemmMatrixMultiplyReshapedKernel.h | 113 -- .../ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp | 443 ----- .../ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h | 104 -- .../cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp | 224 --- .../gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h | 78 - .../cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp | 175 -- .../gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h | 84 - .../gpu/cl/kernels/ClHeightConcatenateKernel.cpp | 132 -- .../gpu/cl/kernels/ClHeightConcatenateKernel.h | 71 - src/core/gpu/cl/kernels/ClIm2ColKernel.cpp | 431 ----- src/core/gpu/cl/kernels/ClIm2ColKernel.h | 106 -- src/core/gpu/cl/kernels/ClMulKernel.cpp | 439 ----- src/core/gpu/cl/kernels/ClMulKernel.h | 118 -- src/core/gpu/cl/kernels/ClPermuteKernel.cpp | 152 -- src/core/gpu/cl/kernels/ClPermuteKernel.h | 73 - src/core/gpu/cl/kernels/ClPool2dKernel.cpp | 509 ------ src/core/gpu/cl/kernels/ClPool2dKernel.h | 75 - src/core/gpu/cl/kernels/ClQuantizeKernel.cpp | 180 -- src/core/gpu/cl/kernels/ClQuantizeKernel.h | 69 - src/core/gpu/cl/kernels/ClReshapeKernel.cpp | 134 -- src/core/gpu/cl/kernels/ClReshapeKernel.h | 64 - src/core/gpu/cl/kernels/ClScaleKernel.cpp | 213 --- src/core/gpu/cl/kernels/ClScaleKernel.h | 70 - src/core/gpu/cl/kernels/ClSoftmaxKernel.cpp | 365 ----- src/core/gpu/cl/kernels/ClSoftmaxKernel.h | 118 -- src/core/gpu/cl/kernels/ClTransposeKernel.cpp | 124 -- src/core/gpu/cl/kernels/ClTransposeKernel.h | 64 - src/core/gpu/cl/kernels/ClWeightsReshapeKernel.cpp | 164 -- src/core/gpu/cl/kernels/ClWeightsReshapeKernel.h | 93 -- .../kernels/ClWidthConcatenate2TensorsKernel.cpp | 159 -- .../cl/kernels/ClWidthConcatenate2TensorsKernel.h | 67 - .../kernels/ClWidthConcatenate4TensorsKernel.cpp | 185 --- .../cl/kernels/ClWidthConcatenate4TensorsKernel.h | 70 - .../gpu/cl/kernels/ClWidthConcatenateKernel.cpp | 127 -- src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h | 68 - .../cl/kernels/ClWinogradFilterTransformKernel.cpp 
| 156 -- .../cl/kernels/ClWinogradFilterTransformKernel.h | 77 - .../cl/kernels/ClWinogradInputTransformKernel.cpp | 278 ---- .../cl/kernels/ClWinogradInputTransformKernel.h | 87 - .../cl/kernels/ClWinogradOutputTransformKernel.cpp | 268 --- .../cl/kernels/ClWinogradOutputTransformKernel.h | 85 - src/core/gpu/cl/kernels/gemm/ClGemmHelpers.cpp | 116 -- src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h | 95 -- src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h | 123 -- .../native/ClGemmDefaultConfigNativeBifrost.cpp | 246 --- .../gemm/native/ClGemmDefaultConfigNativeBifrost.h | 62 - .../native/ClGemmDefaultConfigNativeMidgard.cpp | 73 - .../gemm/native/ClGemmDefaultConfigNativeMidgard.h | 57 - .../native/ClGemmDefaultConfigNativeValhall.cpp | 168 -- .../gemm/native/ClGemmDefaultConfigNativeValhall.h | 59 - .../kernels/gemm/native/ClGemmNativeKernelConfig.h | 71 - .../ClGemmDefaultConfigReshapedBifrost.cpp | 356 ---- .../reshaped/ClGemmDefaultConfigReshapedBifrost.h | 64 - .../ClGemmDefaultConfigReshapedValhall.cpp | 538 ------ .../reshaped/ClGemmDefaultConfigReshapedValhall.h | 61 - .../gemm/reshaped/ClGemmReshapedKernelConfig.h | 69 - .../ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp | 547 ------- .../ClGemmDefaultConfigReshapedRhsOnlyBifrost.h | 68 - .../ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp | 570 ------- .../ClGemmDefaultConfigReshapedRhsOnlyValhall.h | 61 - .../ClGemmReshapedOnlyRhsKernelConfig.h | 69 - src/core/utils/AssemblyUtils.h | 2 +- 275 files changed, 5 insertions(+), 57288 deletions(-) delete mode 100644 src/core/cpu/ICpuKernel.h delete mode 100644 src/core/cpu/kernels/CpuActivationKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuActivationKernel.h delete mode 100644 src/core/cpu/kernels/CpuAddKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuAddKernel.h delete mode 100644 src/core/cpu/kernels/CpuCastKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuCastKernel.h delete mode 100644 src/core/cpu/kernels/CpuCol2ImKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuCol2ImKernel.h delete mode 100644 src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuConcatenateBatchKernel.h delete mode 100644 src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuConcatenateDepthKernel.h delete mode 100644 src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuConcatenateHeightKernel.h delete mode 100644 src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuConcatenateWidthKernel.h delete mode 100644 src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h delete mode 100644 src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.h delete mode 100644 src/core/cpu/kernels/CpuCopyKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuCopyKernel.h delete mode 100644 src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h delete mode 100644 src/core/cpu/kernels/CpuDequantizeKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuDequantizeKernel.h delete mode 100644 src/core/cpu/kernels/CpuDirectConv2dKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuDirectConv2dKernel.h delete mode 100644 src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp delete mode 100644 
src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h delete mode 100644 src/core/cpu/kernels/CpuElementwiseKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuElementwiseKernel.h delete mode 100644 src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuElementwiseUnaryKernel.h delete mode 100644 src/core/cpu/kernels/CpuFillKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuFillKernel.h delete mode 100644 src/core/cpu/kernels/CpuFloorKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuFloorKernel.h delete mode 100644 src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp delete mode 100644 src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h delete mode 100644 src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h delete mode 100644 src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h delete mode 100644 src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h delete mode 100644 src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h delete mode 100644 src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h delete mode 100644 src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h delete mode 100644 src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h delete mode 100644 src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h delete mode 100644 src/core/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuGemmMatrixAdditionKernel.h delete mode 100644 src/core/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuGemmMatrixMultiplyKernel.h delete mode 100644 src/core/cpu/kernels/CpuGemmTranspose1xWKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuGemmTranspose1xWKernel.h delete mode 100644 src/core/cpu/kernels/CpuIm2ColKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuIm2ColKernel.h delete mode 100644 src/core/cpu/kernels/CpuMulKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuMulKernel.h delete mode 100644 src/core/cpu/kernels/CpuPermuteKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuPermuteKernel.h delete mode 100644 src/core/cpu/kernels/CpuPool2dKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuPool2dKernel.h delete mode 100644 src/core/cpu/kernels/CpuQuantizeKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuQuantizeKernel.h delete mode 100644 src/core/cpu/kernels/CpuReshapeKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuReshapeKernel.h delete mode 100644 src/core/cpu/kernels/CpuScaleKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuScaleKernel.h delete mode 100644 src/core/cpu/kernels/CpuSoftmaxKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuSoftmaxKernel.h delete mode 100644 src/core/cpu/kernels/CpuSubKernel.cpp delete mode 100644 
src/core/cpu/kernels/CpuSubKernel.h delete mode 100644 src/core/cpu/kernels/CpuTransposeKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuTransposeKernel.h delete mode 100644 src/core/cpu/kernels/CpuWeightsReshapeKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuWeightsReshapeKernel.h delete mode 100644 src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuWinogradConv2dKernel.h delete mode 100644 src/core/cpu/kernels/activation/list.h delete mode 100644 src/core/cpu/kernels/activation/neon/fp16.cpp delete mode 100644 src/core/cpu/kernels/activation/neon/fp32.cpp delete mode 100644 src/core/cpu/kernels/activation/neon/qasymm8.cpp delete mode 100644 src/core/cpu/kernels/activation/neon/qasymm8_signed.cpp delete mode 100644 src/core/cpu/kernels/activation/neon/qsymm16.cpp delete mode 100644 src/core/cpu/kernels/activation/sve/fp16.cpp delete mode 100644 src/core/cpu/kernels/activation/sve/fp32.cpp delete mode 100644 src/core/cpu/kernels/activation/sve/qasymm8.cpp delete mode 100644 src/core/cpu/kernels/activation/sve/qasymm8_signed.cpp delete mode 100644 src/core/cpu/kernels/activation/sve/qsymm16.cpp delete mode 100644 src/core/cpu/kernels/add/neon/list.h delete mode 100644 src/core/cpu/kernels/add/neon/qasymm8.cpp delete mode 100644 src/core/cpu/kernels/add/neon/qasymm8_signed.cpp delete mode 100644 src/core/cpu/kernels/add/neon/qsymm16.cpp delete mode 100644 src/core/cpu/kernels/add/sve/impl.cpp delete mode 100644 src/core/cpu/kernels/add/sve/impl.h delete mode 100644 src/core/cpu/kernels/add/sve/list.h delete mode 100644 src/core/cpu/kernels/add/sve/qasymm8.cpp delete mode 100644 src/core/cpu/kernels/add/sve/qasymm8_signed.cpp delete mode 100644 src/core/cpu/kernels/add/sve/qsymm16.cpp delete mode 100644 src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h delete mode 100644 src/core/cpu/kernels/assembly/arm_gemm.hpp delete mode 100644 src/core/cpu/kernels/assembly/arm_gemm_compute_iface.hpp delete mode 100644 src/core/cpu/kernels/assembly/arm_gemm_local.hpp delete mode 100644 src/core/cpu/kernels/assembly/convolution_parameters.hpp delete mode 100644 src/core/cpu/kernels/assembly/gemm_common.hpp delete mode 100644 src/core/cpu/kernels/assembly/ndrange.hpp delete mode 100644 src/core/cpu/kernels/elementwise/neon/elementwise_list.h delete mode 100644 src/core/cpu/kernels/elementwise/neon/elementwise_quantized_list.h delete mode 100644 src/core/cpu/kernels/elementwise/neon/elementwise_unary_list.h delete mode 100644 src/core/cpu/kernels/elementwise/sve/elementwise.cpp delete mode 100644 src/core/cpu/kernels/elementwise/sve/elementwise_list.h delete mode 100644 src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h delete mode 100644 src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp delete mode 100644 src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h delete mode 100644 src/core/cpu/kernels/floor/list.h delete mode 100644 src/core/cpu/kernels/floor/neon/fp16.cpp delete mode 100644 src/core/cpu/kernels/floor/neon/fp32.cpp delete mode 100644 src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp delete mode 100644 src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h delete mode 100644 src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp delete mode 100644 src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h delete mode 100644 src/core/cpu/kernels/pool2d/neon/fp16.cpp delete mode 100644 src/core/cpu/kernels/pool2d/neon/fp32.cpp delete mode 100644 
src/core/cpu/kernels/pool2d/neon/list.h delete mode 100644 src/core/cpu/kernels/pool2d/neon/nchw/all.cpp delete mode 100644 src/core/cpu/kernels/pool2d/neon/qasymm8.cpp delete mode 100644 src/core/cpu/kernels/pool2d/neon/qasymm8_signed.cpp delete mode 100644 src/core/cpu/kernels/pool2d/neon/quantized.h delete mode 100644 src/core/cpu/kernels/scale/neon/fp16.cpp delete mode 100644 src/core/cpu/kernels/scale/neon/integer.cpp delete mode 100644 src/core/cpu/kernels/scale/neon/list.h delete mode 100644 src/core/cpu/kernels/scale/neon/qasymm8.cpp delete mode 100644 src/core/cpu/kernels/scale/neon/qasymm8_signed.cpp delete mode 100644 src/core/cpu/kernels/scale/sve/fp16.cpp delete mode 100644 src/core/cpu/kernels/scale/sve/fp32.cpp delete mode 100644 src/core/cpu/kernels/scale/sve/integer.cpp delete mode 100644 src/core/cpu/kernels/scale/sve/list.h delete mode 100644 src/core/cpu/kernels/scale/sve/qasymm8.cpp delete mode 100644 src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp delete mode 100644 src/core/cpu/kernels/softmax/impl/neon/list.h delete mode 100644 src/core/cpu/kernels/softmax/impl/sve/impl.cpp delete mode 100644 src/core/cpu/kernels/softmax/impl/sve/list.h delete mode 100644 src/core/cpu/kernels/sub/neon/list.h delete mode 100644 src/core/cpu/kernels/sub/neon/qasymm8.cpp delete mode 100644 src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp delete mode 100644 src/core/cpu/kernels/sub/neon/qsymm16.cpp delete mode 100644 src/core/gpu/cl/ClCompileContext.h delete mode 100644 src/core/gpu/cl/ClKernelLibrary.cpp delete mode 100644 src/core/gpu/cl/ClKernelLibrary.h delete mode 100644 src/core/gpu/cl/IClKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClActivationKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClActivationKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClCastKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClCastKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClCol2ImKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClCol2ImKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClCopyKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClCopyKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClCropKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClCropKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClDequantizeKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClDequantizeKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClDirectConv2dKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClElementwiseKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClElementwiseKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClFillKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClFillKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClFloorKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClFloorKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp 
delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClIm2ColKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClIm2ColKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClMulKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClMulKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClPermuteKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClPermuteKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClPool2dKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClPool2dKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClQuantizeKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClQuantizeKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClReshapeKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClReshapeKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClScaleKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClScaleKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClSoftmaxKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClSoftmaxKernel.h delete mode 
100644 src/core/gpu/cl/kernels/ClTransposeKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClTransposeKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClWeightsReshapeKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClWeightsReshapeKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.h delete mode 100644 src/core/gpu/cl/kernels/gemm/ClGemmHelpers.cpp delete mode 100644 src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h delete mode 100644 src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h delete mode 100644 src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp delete mode 100644 src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h delete mode 100644 src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp delete mode 100644 src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h delete mode 100644 src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp delete mode 100644 src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h delete mode 100644 src/core/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h delete mode 100644 src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp delete mode 100644 src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h delete mode 100644 src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp delete mode 100644 src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h delete mode 100644 src/core/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h delete mode 100644 src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp delete mode 100644 src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h delete mode 100644 src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp delete mode 100644 src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h delete mode 100644 src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h (limited to 'src/core') diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp index 5c53455eeb..10ccc4f9a4 100644 --- a/src/core/CL/CLHelpers.cpp +++ b/src/core/CL/CLHelpers.cpp @@ -27,9 +27,9 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Log.h" #include "arm_compute/core/Types.h" -#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/ClKernelLibrary.h" +#include "src/gpu/cl/ClKernelLibrary.h" #include #include diff --git 
a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index d8983fcae9..c5a0796c3a 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/Error.h" -#include "src/core/gpu/cl/ClKernelLibrary.h" +#include "src/gpu/cl/ClKernelLibrary.h" #include #include #include diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp index 1437b5bebb..2b74f91a05 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp @@ -34,9 +34,9 @@ #include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" #include "src/core/CL/ICLKernel.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include "support/StringSupport.h" namespace arm_compute diff --git a/src/core/cpu/ICpuKernel.h b/src/core/cpu/ICpuKernel.h deleted file mode 100644 index 650b3a7d0b..0000000000 --- a/src/core/cpu/ICpuKernel.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_ICPUKERNEL_H -#define ARM_COMPUTE_ICPUKERNEL_H - -#include "arm_compute/core/CPP/ICPPKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -using ICpuKernel = arm_compute::ICPPKernel; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_ICPUKERNEL_H */ diff --git a/src/core/cpu/kernels/CpuActivationKernel.cpp b/src/core/cpu/kernels/CpuActivationKernel.cpp deleted file mode 100644 index dad2ecfc5b..0000000000 --- a/src/core/cpu/kernels/CpuActivationKernel.cpp +++ /dev/null @@ -1,260 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuActivationKernel.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "src/core/CPP/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "src/core/common/Registrars.h" -#include "src/core/cpu/kernels/activation/list.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -struct ActivationSelectorData -{ - DataType dt; - const CPUInfo &ci; -}; - -using ActivationSelectorPtr = std::add_pointer::type; -using ActivationKernelPtr = std::add_pointer::type; - -struct ActivationKernel -{ - const char *name; - const ActivationSelectorPtr is_selected; - ActivationKernelPtr ukernel; -}; - -static const ActivationKernel available_kernels[] = -{ -#if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_fp16_activation", - [](const ActivationSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, - REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_activation) - }, - { - "sve_fp32_activation", - [](const ActivationSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, - REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_activation) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ -#if defined(ARM_COMPUTE_ENABLE_NEON) - { - "neon_fp16_activation", - [](const ActivationSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_activation) - }, - { - "neon_fp32_activation", - [](const ActivationSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_activation) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ -#if defined(ARM_COMPUTE_ENABLE_SVE2) - { - "sve_qu8_activation", - [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_activation) - }, - { - "sve_qs8_activation", - [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_activation) - }, - { - "sve_qs16_activation", - [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16 && data.ci.has_sve2(); }, - 
REGISTER_QSYMM16_SVE(arm_compute::cpu::qsymm16_sve_activation) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ - { - "neon_qu8_activation", - [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_activation) - }, - { - "neon_qs8_activation", - [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_activation) - }, - { - "neon_qs16_activation", - [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16; }, - REGISTER_QSYMM16_NEON(arm_compute::cpu::qsymm16_neon_activation) - }, -}; - -const ActivationKernel *get_implementation(const ActivationSelectorData &data) -{ - for(const auto &uk : available_kernels) - { - if(uk.is_selected(data)) - { - return &uk; - } - } - return nullptr; -} - -/* Supported activation in the 8-bit integer domain */ -static const std::array qasymm8_activations = -{ - ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LOGISTIC, - ActivationLayerInfo::ActivationFunction::TANH, - ActivationLayerInfo::ActivationFunction::HARD_SWISH, - ActivationLayerInfo::ActivationFunction::LEAKY_RELU, -}; -/* Supported activation in the 16-bit integer domain */ -static const std::array qsymm16_activations = -{ - ActivationLayerInfo::ActivationFunction::LOGISTIC, - ActivationLayerInfo::ActivationFunction::TANH, - ActivationLayerInfo::ActivationFunction::HARD_SWISH -}; - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &activation_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32); - - const auto *uk = get_implementation(ActivationSelectorData{ src->data_type(), CPUInfo::get() }); - ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - - const DataType data_type = src->data_type(); - const QuantizationInfo &oq_info = (dst != nullptr) ? 
dst->quantization_info() : src->quantization_info(); - const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation(); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_asymmetric(data_type) && (std::find(std::begin(qasymm8_activations), std::end(qasymm8_activations), f_act) == std::end(qasymm8_activations)), - "For QASYMM8 only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) && (std::find(std::begin(qsymm16_activations), std::end(qsymm16_activations), f_act) == std::end(qsymm16_activations)), - "For QSYMM16 only tanh and logistic are supported"); - ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) - && (oq_info != QuantizationInfo(1.f / 128.f, 128))); - ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - && (oq_info != QuantizationInfo(1.f / 256.f, 0))); - - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 0))); - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, -128))); - - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); - - // Checks performed when dst is configured - if((dst != nullptr) && (dst->total_size() != 0)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - - return Status{}; -} - -std::pair validate_and_configure_window(const ITensorInfo *src, ITensorInfo *dst) -{ - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - - if(dst != nullptr) - { - // dst auto inizialitation if not yet initialized - auto_init_if_empty(*dst, *src->clone()); - } - - return std::make_pair(Status{}, win); -} -} // namespace - -void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo activation_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, activation_info)); - - const auto uk = get_implementation(ActivationSelectorData{ src->data_type(), CPUInfo::get() }); - ARM_COMPUTE_ERROR_ON_NULLPTR(uk); - - _act_info = activation_info; - _run_method = uk->ukernel; - _name = std::string("CpuActivationKernel").append("/").append(uk->name); - - // Configure kernel window - auto win_config = validate_and_configure_window(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICPPKernel::configure(win_config.second); -} - -Status CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_UNUSED(act_info); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, act_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), (dst != nullptr) ? 
dst->clone().get() : nullptr).first); - - return Status{}; -} - -void CpuActivationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - // Early exit on disabled activation - if(!_act_info.enabled()) - { - return; - } - - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - ARM_COMPUTE_ERROR_ON(tensors.empty()); - ARM_COMPUTE_ERROR_ON(_run_method == nullptr); - - const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); - ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - - _run_method(src, dst, _act_info, window); -} - -const char *CpuActivationKernel::name() const -{ - return _name.c_str(); -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuActivationKernel.h b/src/core/cpu/kernels/CpuActivationKernel.h deleted file mode 100644 index 37650345fe..0000000000 --- a/src/core/cpu/kernels/CpuActivationKernel.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H -#define ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the activation kernel */ -class CpuActivationKernel : public ICpuKernel -{ -public: - CpuActivationKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuActivationKernel); - /** Configure kernel for a given list of arguments - * - * @note If the output tensor is a nullptr, the activation function will be performed in-place - * - * @param[in, out] src Source tensor info. In case of @p dst tensor = nullptr, this tensor will store the result - * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. - * @param[out] dst Destination tensor info. Data type supported: same as @p src - * @param[in] activation_info Activation layer information. 
- */ - void configure(const ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo activation_info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuActivationKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - using ActivationKernelPtr = std::add_pointer::type; - -private: - ActivationLayerInfo _act_info{}; - ActivationKernelPtr _run_method{ nullptr }; - std::string _name{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuAddKernel.cpp b/src/core/cpu/kernels/CpuAddKernel.cpp deleted file mode 100644 index 61b7b19443..0000000000 --- a/src/core/cpu/kernels/CpuAddKernel.cpp +++ /dev/null @@ -1,296 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuAddKernel.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" -#include "src/core/common/Registrars.h" -#include "src/core/cpu/kernels/add/neon/list.h" -#include "src/core/cpu/kernels/add/sve/list.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -struct AddSelectorData -{ - DataType dt; - const CPUInfo &ci; -}; - -using AddSelectorPtr = std::add_pointer::type; -using AddKernelPtr = std::add_pointer::type; -struct AddKernel -{ - const char *name; - const AddSelectorPtr is_selected; - AddKernelPtr ukernel; -}; - -static const AddKernel available_kernels[] = -{ -#if defined(ARM_COMPUTE_ENABLE_SVE2) - { - "sve2_qu8_add", - [](const AddSelectorData & data) - { - return (data.dt == DataType::QASYMM8) && data.ci.has_sve(); - }, - REGISTER_QASYMM8_SVE(arm_compute::cpu::add_qasymm8_sve) - }, - { - "sve2_qs8_add", - [](const AddSelectorData & data) - { - return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve(); - }, - REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::add_qasymm8_signed_sve) - }, - { - "sve2_qs16_add", - [](const AddSelectorData & data) - { - return (data.dt == DataType::QSYMM16) && data.ci.has_sve(); - }, - REGISTER_QSYMM16_SVE(arm_compute::cpu::add_qsymm16_sve) - }, -#endif /* !defined(ARM_COMPUTE_ENABLE_SVE2) */ -#if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_fp32_add", - [](const AddSelectorData & data) - { - return (data.dt == DataType::F32) && data.ci.has_sve(); - }, - REGISTER_FP32_SVE(arm_compute::cpu::add_same_sve) - }, - { - "sve_fp16_add", - [](const AddSelectorData & data) - { - return (data.dt == DataType::F16) && data.ci.has_sve(); - }, - REGISTER_FP16_SVE(arm_compute::cpu::add_same_sve) - }, - { - "sve_u8_add", - [](const AddSelectorData & data) - { - return (data.dt == DataType::U8) && data.ci.has_sve(); - }, - REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve) - }, - { - "sve_s16_add", - [](const AddSelectorData & data) - { - return (data.dt == DataType::S16) && data.ci.has_sve(); - }, - REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve) - }, - { - "sve_s32_add", - [](const AddSelectorData & data) - { - return (data.dt == DataType::S32) && data.ci.has_sve(); - }, - REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ -#if defined(ARM_COMPUTE_ENABLE_NEON) - { - "neon_fp32_add", - [](const AddSelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(arm_compute::cpu::add_same_neon) - }, -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_add", - [](const AddSelectorData & data) - { - return (data.dt == DataType::F16) && data.ci.has_fp16(); - }, - REGISTER_FP16_NEON(arm_compute::cpu::add_same_neon) - }, -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ - { - "neon_u8_add", - [](const AddSelectorData & data) { return (data.dt == DataType::U8); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon) - }, - { - "neon_s16_add", - [](const AddSelectorData & data) { return (data.dt == DataType::S16); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon) - }, - { - "neon_s32_add", - [](const AddSelectorData & data) { return (data.dt == DataType::S32); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ -#if 
defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) - { - "neon_qu8_add", - [](const AddSelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon) - }, - { - "neon_qs8_add", - [](const AddSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon) - }, - { - "neon_qs16_add", - [](const AddSelectorData & data) { return (data.dt == DataType::QSYMM16); }, - REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */ -}; - -/** Micro-kernel selector - * - * @param[in] data Selection data passed to help pick the appropriate micro-kernel - * - * @return A matching micro-kernel else nullptr - */ -const AddKernel *get_implementation(const CPUInfo &cpuinfo, DataType dt) -{ - for(const auto &uk : available_kernels) - { - if(uk.is_selected({ dt, cpuinfo })) - { - return &uk; - } - } - return nullptr; -} - -Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy) -{ - ARM_COMPUTE_UNUSED(policy); - - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, - DataType::S32, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1); - - const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src0.tensor_shape().x() != src1.tensor_shape().x()) && ((src0.data_type() != src1.data_type()) || (src0.data_type() != dst.data_type()) - || (src1.data_type() != dst.data_type())), - "Broadcasting across width is supported on configurations where all tensors have the same data type"); - - // Validate in case of configured dst - if(dst.total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), - "Wrong shape for dst"); - } - - const auto *uk = get_implementation(CPUInfo::get(), src0.data_type()); - ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - - return Status{}; -} - -std::pair validate_and_configure_window(const ITensorInfo &src0, const ITensorInfo &src1, ITensorInfo &dst) -{ - const TensorShape &out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape()); - - // Auto initialize dst if not initialized - set_shape_if_empty(dst, out_shape); - set_data_type_if_unknown(dst, src0.data_type()); - - Window win = calculate_max_window(out_shape, Steps()); - - // CpuAddKernel doesn't need padding so update_window_and_padding() can be skipped - return std::make_pair(Status{}, win); -} -} // namespace - -void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy)); - - const auto uk = get_implementation(CPUInfo::get(), src0->data_type()); - ARM_COMPUTE_ERROR_ON_NULLPTR(uk); - - _policy = policy; - _run_method = uk->ukernel; - _name = 
std::string("CpuAddKernel").append("/").append(uk->name); - - // Configure kernel window - auto win_config = validate_and_configure_window(*src0, *src1, *dst); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICpuKernel::configure(win_config.second); -} - -Status CpuAddKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst, policy)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*src0->clone(), *src1->clone(), *dst->clone()).first); - - return Status{}; -} - -void CpuAddKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - ARM_COMPUTE_ERROR_ON(tensors.empty()); - ARM_COMPUTE_ERROR_ON(_run_method == nullptr); - - const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); - const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); - ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - - _run_method(src0, src1, dst, _policy, window); -} - -const char *CpuAddKernel::name() const -{ - return _name.c_str(); -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuAddKernel.h b/src/core/cpu/kernels/CpuAddKernel.h deleted file mode 100644 index 1205b45dfb..0000000000 --- a/src/core/cpu/kernels/CpuAddKernel.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_ADD_KERNEL_H -#define ARM_COMPUTE_CPU_ADD_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the kernel to perform addition between two tensors */ -class CpuAddKernel : public ICpuKernel -{ -public: - CpuAddKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuAddKernel); - /** Initialise the kernel's input, dst and border mode. 
- * - * Valid configurations (src0,src1) -> dst : - * - * - (U8,U8) -> U8 - * - (S16,S16) -> S16 - * - (S32,S32) -> S32 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - (QASYMM8,QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (QSYMM16,QSYMM16) -> QSYMM16 - * - * @param[in] src0 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 - * @param[in] src1 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 - * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. - * @param[in] policy Overflow policy. - */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuAddKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - using AddKernelPtr = std::add_pointer::type; - -private: - ConvertPolicy _policy{}; - AddKernelPtr _run_method{ nullptr }; - std::string _name{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_ADD_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuCastKernel.cpp b/src/core/cpu/kernels/CpuCastKernel.cpp deleted file mode 100644 index 46f3c330ef..0000000000 --- a/src/core/cpu/kernels/CpuCastKernel.cpp +++ /dev/null @@ -1,1367 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
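For reference, the micro-kernel dispatch that CpuAddKernel uses above — an ordered table of {name, selector predicate, function pointer} entries that get_implementation() scans until the first predicate returns true — can be sketched in isolation as follows. This is a minimal standalone C++ illustration with made-up names, not code from the library:

#include <cstdio>

enum class DataType { F32, S32 };

// Selection data passed to every predicate; mirrors the idea of AddSelectorData.
struct SelectorData
{
    DataType dt;
    bool     has_sve;
};

using SelectorPtr = bool (*)(const SelectorData &);
using KernelPtr   = void (*)();

struct KernelEntry
{
    const char *name;
    SelectorPtr is_selected;
    KernelPtr   ukernel;
};

void sve_fp32_kernel() { std::printf("running sve_fp32\n"); }
void neon_fp32_kernel() { std::printf("running neon_fp32\n"); }

// Order encodes priority: more specialised implementations come first.
static const KernelEntry available_kernels[] =
{
    { "sve_fp32",  [](const SelectorData &d) { return d.dt == DataType::F32 && d.has_sve; }, sve_fp32_kernel },
    { "neon_fp32", [](const SelectorData &d) { return d.dt == DataType::F32; },              neon_fp32_kernel },
};

// First matching entry wins; nullptr means no implementation is available.
const KernelEntry *get_implementation(const SelectorData &d)
{
    for(const auto &entry : available_kernels)
    {
        if(entry.is_selected(d))
        {
            return &entry;
        }
    }
    return nullptr;
}

int main()
{
    const KernelEntry *k = get_implementation({ DataType::F32, /* has_sve */ false });
    if(k != nullptr)
    {
        k->ukernel(); // falls through to the NEON entry on a machine without SVE
    }
    return 0;
}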
- */ -#include "src/core/cpu/kernels/CpuCastKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/SaturateCast.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(dst); - ARM_COMPUTE_UNUSED(policy); - ARM_COMPUTE_RETURN_ERROR_ON(src == dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, - DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16, - DataType::F32, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, - DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16, - DataType::U32, DataType::S32, DataType::F32); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32 - && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), - "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 - && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), - "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 - && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), - "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 && (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32), - "Only data_types supported [in] U16 -> [out] U8, U32"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32), - "Only data_types supported [in] S16 -> [out] U8, S32"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::BFLOAT16 && dst->data_type() != DataType::F32, - "Only data_types supported [in] BFLOAT16 -> [out] F32"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8 - && dst->data_type() != DataType::U8 - && dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32), - "Only data_types supported [in] F16 -> [out] QASYMM8, F32, S32, U8"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 && (dst->data_type() != 
DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8 - && dst->data_type() != DataType::F16 && dst->data_type() != DataType::BFLOAT16 - && dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8), - "Only data_types supported [in] F32 -> [out] QASYMM8, BFLOAT16, F16, S32, U8"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8 - && dst->data_type() != DataType::F16 - && dst->data_type() != DataType::F32 && dst->data_type() != DataType::U8), - "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8"); - - // Validate in case of configured dst - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - - return Status{}; -} -} // namespace - -void CpuCastKernel::configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Auto initialize dst shape if not initialized (We can only auto-configure the shape, datatype must be given) - set_shape_if_empty(*dst, src->tensor_shape()); - - _policy = policy; - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, policy)); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - - ICPPKernel::configure(win); -} - -Status CpuCastKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, policy)); - return Status{}; -} - -void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const int window_step_x = 16; - - const ITensor *_src = tensors.get_const_tensor(TensorType::ACL_SRC); - ITensor *_dst = tensors.get_tensor(TensorType::ACL_DST); - ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - ARM_COMPUTE_ERROR_ON(_src == _dst); - - ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator src(_src, win); - Iterator dst(_dst, win); - - switch(_src->info()->data_type()) - { - case DataType::QASYMM8_SIGNED: - { - switch(_dst->info()->data_type()) - { - case DataType::S16: - { - /* Up-conversion QASYMM8_SIGNED -> S16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); - - const int16x8x2_t texels = - { - { - vmovl_s8(vget_low_s8(texels_s8)), - vmovl_s8(vget_high_s8(texels_s8)) - } - }; - - vst1q_s16(dst_ptr + x, texels.val[0]); - vst1q_s16(dst_ptr + x + 8, texels.val[1]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - case DataType::S32: - { - /* Up-conversion QASYMM8_SIGNED -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int8x16_t 
texels_s8 = vld1q_s8(src_ptr + x); - - const int16x8x2_t texels = - { - { - vmovl_s8(vget_low_s8(texels_s8)), - vmovl_s8(vget_high_s8(texels_s8)) - } - }; - - vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); - vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); - vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); - vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - case DataType::F32: - { - /* Up-conversion QASYMM8_SIGNED -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int8x16_t texels_s8 = vld1q_s8(reinterpret_cast(src.ptr())); - - const int16x8x2_t texels = - { - { - vmovl_s8(vget_low_s8(texels_s8)), - vmovl_s8(vget_high_s8(texels_s8)) - } - }; - vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); - vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); - vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); - vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - /* Up-conversion QASYMM8_SIGNED -> F16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); - - const int16x8x2_t texels = - { - { - vmovl_s8(vget_low_s8(texels_s8)), - vmovl_s8(vget_high_s8(texels_s8)) - } - }; - vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); - vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - - default: - ARM_COMPUTE_ERROR("dst data type not supported"); - } - break; - } - - case DataType::QASYMM8: - case DataType::U8: - { - switch(_dst->info()->data_type()) - { - case DataType::S16: - { - /* Up-conversion U8 -> S16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); - - const int16x8x2_t texels = - { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) - } - }; - - vst1q_s16(dst_ptr + x, texels.val[0]); - vst1q_s16(dst_ptr + x + 8, texels.val[1]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - case DataType::S32: - { - /* Up-conversion U8 -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto 
src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); - - const int16x8x2_t texels = - { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) - } - }; - - vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); - vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); - vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); - vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - case DataType::F32: - { - /* Up-conversion U8 -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); - - const int16x8x2_t texels = - { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) - } - }; - vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); - vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); - vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); - vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - /* Up-conversion U8 -> F16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); - - const int16x8x2_t texels = - { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) - } - }; - vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); - vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::U16: - { - /* Up-conversion U8 -> U16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); - - const uint16x8x2_t texels = - { - { - vmovl_u8(vget_low_u8(texels_u8)), - vmovl_u8(vget_high_u8(texels_u8)) - } - }; - - vst1q_u16(dst_ptr + x, texels.val[0]); - vst1q_u16(dst_ptr + x + 8, texels.val[1]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - default: - ARM_COMPUTE_ERROR("dst data type 
not supported"); - } - break; - } - case DataType::S16: - { - switch(_dst->info()->data_type()) - { - case DataType::QASYMM8_SIGNED: - { - /* Down-conversion S16 -> QASYMM8_SIGNED */ - if(ConvertPolicy::SATURATE == _policy) - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8x2_t texels = - { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; - - vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1]))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); - } - else - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8x2_t texels = - { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; - - vst1q_s8(dst_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1]))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - } - break; - } - case DataType::U8: - { - /* Down-conversion S16 -> U8 */ - if(ConvertPolicy::SATURATE == _policy) - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8x2_t texels = - { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; - - vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1]))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); - } - else - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8x2_t texels = - { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; - - vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])), - vmovn_u16(vreinterpretq_u16_s16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - } - break; - } - case DataType::S32: - { - /* Up-conversion S16 -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8x2_t texels = - { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; - - const int32x4x4_t texels_s32 = - { - { - vmovl_s16(vget_low_s16(texels.val[0])), - vmovl_s16(vget_high_s16(texels.val[0])), - vmovl_s16(vget_low_s16(texels.val[1])), - vmovl_s16(vget_high_s16(texels.val[1])) - } - }; - - vst1q_s32(dst_ptr + x, texels_s32.val[0]); - 
vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]); - vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]); - vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - default: - ARM_COMPUTE_ERROR("dst data type not supported"); - } - break; - } - case DataType::U16: - { - switch(_dst->info()->data_type()) - { - case DataType::U8: - { - /* Down-conversion U16 -> U8 */ - if(ConvertPolicy::SATURATE == _policy) - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint16x8x2_t texels = - { - { - vld1q_u16(src_ptr + x), - vld1q_u16(src_ptr + x + 8) - } - }; - - vst1q_u8(dst_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1]))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); - } - else - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint16x8x2_t texels = - { - { - vld1q_u16(src_ptr + x), - vld1q_u16(src_ptr + x + 8) - } - }; - - vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1]))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - - }, - src, dst); - } - break; - } - case DataType::U32: - { - /* Up-conversion U16 -> U32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint16x8x2_t texels = - { - { - vld1q_u16(src_ptr + x), - vld1q_u16(src_ptr + x + 8) - } - }; - - vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0]))); - vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0]))); - vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1]))); - vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1]))); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - - }, - src, dst); - break; - } - default: - ARM_COMPUTE_ERROR("dst data type not supported"); - } - break; - } -#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) - case DataType::BFLOAT16: - switch(_dst->info()->data_type()) - { - case DataType::F32: - { - /* Up-conversion BFLOAT16 -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint16x8x2_t texels = - { - { - vld1q_u16(reinterpret_cast(src.ptr())), - vld1q_u16(reinterpret_cast(src.ptr()) + 8) - } - }; - - vst1q_f32(reinterpret_cast(dst.ptr()), - vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[0])), 16))); - vst1q_f32(reinterpret_cast(dst.ptr()) + 4, - 
vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[0])), 16))); - vst1q_f32(reinterpret_cast(dst.ptr()) + 8, - vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[1])), 16))); - vst1q_f32(reinterpret_cast(dst.ptr()) + 12, - vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[1])), 16))); - } - - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = float(*(src_ptr + x)); - } - }, - src, dst); - break; - } - default: - ARM_COMPUTE_ERROR("dst data type unsupported"); - } - break; -#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - switch(_dst->info()->data_type()) - { - case DataType::QASYMM8_SIGNED: - { - /* Down-conversion F16 -> QASYMM8_SIGNED (Always saturating) */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float16x8x2_t texels = - { - { - vld1q_f16(src_ptr + x), - vld1q_f16(src_ptr + x + 8), - } - }; - - vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), vqmovn_s16(vcvtq_s16_f16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - case DataType::QASYMM8: - case DataType::U8: - { - /* Down-conversion F16 -> QASYMM8/U8 (Always saturating) */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float16x8x2_t texels = - { - { - vld1q_f16(src_ptr + x), - vld1q_f16(src_ptr + x + 8), - } - }; - - vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), vqmovun_s16(vcvtq_s16_f16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - - }, - src, dst); - break; - } - case DataType::F32: - { - /* Up-conversion F16 -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float16x8x2_t texels = - { - { - vld1q_f16(src_ptr + x), - vld1q_f16(src_ptr + x + 8) - } - }; - vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0]))); - vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0]))); - vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1]))); - vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1]))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - case DataType::S32: - { - /* Up-conversion F16 -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float16x8x2_t texels = - { - { - vld1q_f16(src_ptr + x), - vld1q_f16(src_ptr + x + 8) - } - }; - - 
vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])))); - vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])))); - vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])))); - vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - default: - ARM_COMPUTE_ERROR("dst data type not supported"); - } - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - switch(_dst->info()->data_type()) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - /* Down-conversion F32 -> F16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t texels = - { - { - vld1q_f32(src_ptr + x), - vld1q_f32(src_ptr + x + 4), - vld1q_f32(src_ptr + x + 8), - vld1q_f32(src_ptr + x + 12) - } - }; - - vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); - vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) - case DataType::BFLOAT16: - { - /* Down-conversion F32 -> BFLOAT16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vcvt_bf16_f32(reinterpret_cast(src.ptr()), - reinterpret_cast(dst.ptr())); - wrapper::vcvt_bf16_f32(reinterpret_cast(src.ptr()) + 8, - reinterpret_cast(dst.ptr()) + 8); - } - - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = *(src_ptr + x); - } - }, - src, dst); - break; - } -#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */ - case DataType::S32: - { - /* Conversion F32 -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t texels = - { - { - vld1q_f32(src_ptr + x), - vld1q_f32(src_ptr + x + 4), - vld1q_f32(src_ptr + x + 8), - vld1q_f32(src_ptr + x + 12), - } - }; - - vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0])); - vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1])); - vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2])); - vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3])); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - case DataType::QASYMM8: - case DataType::U8: - { - /* Down-conversion F32 -> U8 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - 
int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t texels = - { - { - vld1q_f32(src_ptr + x), - vld1q_f32(src_ptr + x + 4), - vld1q_f32(src_ptr + x + 8), - vld1q_f32(src_ptr + x + 12), - } - }; - - vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1]))))); - vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3]))))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - case DataType::QASYMM8_SIGNED: - { - /* Down-conversion F32 -> QASYMM8_SIGNED */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t texels = - { - { - vld1q_f32(src_ptr + x), - vld1q_f32(src_ptr + x + 4), - vld1q_f32(src_ptr + x + 8), - vld1q_f32(src_ptr + x + 12), - } - }; - - vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1]))))); - vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3]))))); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - - default: - ARM_COMPUTE_ERROR("dst data type not supported"); - } - break; - - case DataType::S32: - switch(_dst->info()->data_type()) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - /* Down-conversion S32 -> F16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t texels = - { - { - vcvtq_f32_s32(vld1q_s32(src_ptr + x)), - vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)), - vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)), - vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12)) - } - }; - - vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); - vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - { - /* Conversion S32 -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int32x4x4_t texels = - { - { - vld1q_s32(src_ptr + x), - vld1q_s32(src_ptr + x + 4), - vld1q_s32(src_ptr + x + 8), - vld1q_s32(src_ptr + x + 12), - } - }; - - vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0])); - vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1])); - vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2])); - vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3])); - } - - // Compute left-over elements - for(; 
x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - break; - } - case DataType::QASYMM8_SIGNED: - { - /* Down-conversion S32 -> QASYMM8_SIGNED */ - if(ConvertPolicy::SATURATE == _policy) - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int32x4x4_t texels = - { - { - vld1q_s32(src_ptr + x), - vld1q_s32(src_ptr + x + 4), - vld1q_s32(src_ptr + x + 8), - vld1q_s32(src_ptr + x + 12), - } - }; - vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1])))); - vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); - } - else - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int32x4x4_t texels = - { - { - vld1q_s32(src_ptr + x), - vld1q_s32(src_ptr + x + 4), - vld1q_s32(src_ptr + x + 8), - vld1q_s32(src_ptr + x + 12) - } - }; - - vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1])))); - vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - } - break; - } - case DataType::QASYMM8: - case DataType::U8: - { - /* Down-conversion S32 -> U8 */ - if(ConvertPolicy::SATURATE == _policy) - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int32x4x4_t texels = - { - { - vld1q_s32(src_ptr + x), - vld1q_s32(src_ptr + x + 4), - vld1q_s32(src_ptr + x + 8), - vld1q_s32(src_ptr + x + 12) - } - }; - vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1])))); - vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); - } - }, - src, dst); - } - else - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast(src.ptr()); - const auto dst_ptr = reinterpret_cast(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int32x4x4_t texels = - { - { - vld1q_s32(src_ptr + x), - vld1q_s32(src_ptr + x + 4), - vld1q_s32(src_ptr + x + 8), - vld1q_s32(src_ptr + x + 12) - } - }; - - vst1_u8(dst_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1]))))); - vst1_u8(dst_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3]))))); - } - - // Compute left-over elements - for(; x < window_end_x; 
++x) - { - *(dst_ptr + x) = static_cast(*(src_ptr + x)); - } - }, - src, dst); - } - break; - } - default: - ARM_COMPUTE_ERROR("dst data type not supported"); - } - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - } -} - -const char *CpuCastKernel::name() const -{ - return "CpuCastKernel.cpp"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuCastKernel.h b/src/core/cpu/kernels/CpuCastKernel.h deleted file mode 100644 index 2a75c5850e..0000000000 --- a/src/core/cpu/kernels/CpuCastKernel.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_CAST_KERNEL_H -#define ARM_COMPUTE_CPU_CAST_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Casts a given tensor to a new type - * - * @note When casting between quantized types the scale and zeroPoint are ignored - */ -class CpuCastKernel : public ICpuKernel -{ -public: - CpuCastKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuCastKernel); - /** Set the src and dst of the kernel - * - * Valid conversions src -> dst : - * - * - QASYMM8_SIGNED -> S16, S32, F32, F16 - * - QASYMM8 -> U16, S16, S32, F32, F16 - * - U8 -> U16, S16, S32, F32, F16 - * - U16 -> U8, U32 - * - S16 -> QASYMM8_SIGNED, U8, S32 - * - BFLOAT16 -> F32 - * - F16 -> QASYMM8_SIGNED, QASYMM8, F32, S32, U8 - * - S32 -> QASYMM8_SIGNED, QASYMM8, F16, F32, U8 - * - F32 -> QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8 - * - * @param[in] src The src tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/BFLOAT16/F16/F32. - * @param[out] dst The dst tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/U32/S32/BFLOAT16/F16/F32. - * @param[in] policy Conversion policy. 
- */ - void configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuCastKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - ConvertPolicy _policy{ ConvertPolicy::SATURATE }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_CAST_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuCol2ImKernel.cpp b/src/core/cpu/kernels/CpuCol2ImKernel.cpp deleted file mode 100644 index f860825de6..0000000000 --- a/src/core/cpu/kernels/CpuCol2ImKernel.cpp +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuCol2ImKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Size2D.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -using namespace misc::shape_calculator; -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims) -{ - //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. 
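Stepping back to the cast kernel above: every down-conversion branches on ConvertPolicy::SATURATE versus the wrapping default, as in the S16 -> U8 case. The difference can be illustrated with a small standalone C++ program (the helper names are invented for the example; this is not the kernel's code):

#include <cstdint>
#include <cstdio>

// SATURATE: clamp to the representable range of the destination type.
static uint8_t cast_s16_to_u8_saturate(int16_t v)
{
    if(v < 0)   return 0;
    if(v > 255) return 255;
    return static_cast<uint8_t>(v);
}

// Wrapping policy: keep only the low 8 bits, exactly like a plain static_cast.
static uint8_t cast_s16_to_u8_wrap(int16_t v)
{
    return static_cast<uint8_t>(v);
}

int main()
{
    const int16_t samples[] = { -7, 100, 300 };
    for(int16_t s : samples)
    {
        std::printf("%4d -> saturate: %3d  wrap: %3d\n",
                    static_cast<int>(s),
                    static_cast<int>(cast_s16_to_u8_saturate(s)),
                    static_cast<int>(cast_s16_to_u8_wrap(s)));
    }
    return 0;
}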
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - - // Validate configured output - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, false)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - } - - return Status{}; -} -} // namespace - -void CpuCol2ImKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, convolved_dims)); - - _convolved_dims = convolved_dims; - - // Configure kernel window - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_col2im_shape(*src, convolved_dims, false))); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - - ICpuKernel::configure(win); -} - -Status CpuCol2ImKernel::validate(const ITensorInfo *src, const ITensorInfo *output, const Size2D &convolved_dims) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, output, convolved_dims)); - return Status{}; -} - -void CpuCol2ImKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - const uint8_t el_size = src->info()->element_size(); - const int output_stride_x = dst->info()->strides_in_bytes().x(); - const int output_stride_y = dst->info()->strides_in_bytes().y(); - const int output_stride_z = dst->info()->strides_in_bytes().z(); - - Window window_out(window); - window_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - // Create iterators - Iterator in(src, window); - Iterator out(dst, window_out); - - execute_window_loop(window, [&](const Coordinates & id) - { - const int hidx = id.y(); - const int idx = id.x() * output_stride_z + (hidx / _convolved_dims.width) * output_stride_y + (hidx % _convolved_dims.width) * output_stride_x; - std::memcpy(out.ptr() + idx, in.ptr(), el_size); - }, - in, out); -} - -const char *CpuCol2ImKernel::name() const -{ - return "CpuCol2ImKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/CpuCol2ImKernel.h b/src/core/cpu/kernels/CpuCol2ImKernel.h deleted file mode 100644 index 3c1802230b..0000000000 --- a/src/core/cpu/kernels/CpuCol2ImKernel.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_COL2IM_KERNEL_H -#define ARM_COMPUTE_CPU_COL2IM_KERNEL_H - -#include "arm_compute/core/Size2D.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel to perform col2im reshaping. - * - * Rearranges each matrix column into image blocks. It's the inverse operation of @ref CpuIm2ColKernel. - * - * For example, a vector of 9 elements can be reshaped to a block(image) of 3x3: - * - * @f[ - * \left( \begin{array}{ccccccccc} - * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\ - * \end{array} \right) - * \rightarrow - * \left( \begin{array}{ccc} - * a0 & a1 & a2 \\ - * a3 & a4 & a5 \\ - * a6 & a7 & a8 \\ - * \end{array} \right) - * @f] - */ -class CpuCol2ImKernel : public ICpuKernel -{ -public: - /** Default constructor */ - CpuCol2ImKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuCol2ImKernel); - /** Set the input and output of the kernel. - * - * @param[in] src The input tensor info to convert. Data types supported: All - * @param[out] dst The output tensor info. 3 lower dimensions represent a single output [width, height, OFM], - * while the rest represent batch of outputs. Data types supported: Same as @p input - * @param[in] convolved_dims Output convolved dimensions. - */ - void configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuCol2ImKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - Size2D _convolved_dims{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /*ARM_COMPUTE_CPU_COL2IM_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp b/src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp deleted file mode 100644 index 16c0efc793..0000000000 --- a/src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. 
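The Col2Im kernel shown above rearranges each column element back into an image block using only hidx / width and hidx % width, matching the 9-element to 3x3 example in its class comment. A standalone sketch of the same index mapping (illustrative only, not library code):

#include <cstdio>

int main()
{
    const int width  = 3;
    const int height = 3;

    // One flattened column of width * height elements, as produced by im2col.
    const int col[width * height] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 };

    int image[height][width] = {};
    for(int hidx = 0; hidx < width * height; ++hidx)
    {
        // Same arithmetic as the kernel's destination-offset computation.
        image[hidx / width][hidx % width] = col[hidx];
    }

    for(int y = 0; y < height; ++y)
    {
        std::printf("%d %d %d\n", image[y][0], image[y][1], image[y][2]);
    }
    return 0;
}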
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuConcatenateBatchKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -template -void batch_concat(const ITensor *src, ITensor *dst, unsigned int batch_offset, const Window &window) -{ - // Offset src - uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); - - // Offset dst - uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + batch_offset * dst->info()->strides_in_bytes()[3]; - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const int window_step_x = 16 / dst->info()->element_size(); - - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - win.set(3, Window::Dimension(0, src->info()->tensor_shape()[3], 1)); - - Iterator src_it(src, win); - Iterator dst_it(dst, win); - - const DataType dt = src->info()->data_type(); - const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform(); - if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr, vquantize(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); - } - else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(src_ptr + 
src_it.offset()); - const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr, vquantize_signed(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo)); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); - } - else - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = *(in_ptr + x); - } - }, - src_it, dst_it); - } -} - -Status validate_arguments(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX)); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) != dst->dimension(Window::DimY)); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimZ) != dst->dimension(Window::DimZ)); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(3) + batch_offset > dst->dimension(3)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(4, src, dst); - - return Status{}; -} -} // namespace - -void CpuConcatenateBatchKernel::configure(const ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, batch_offset, dst)); - - _func = nullptr; - _batch_offset = batch_offset; - - switch(src->data_type()) - { - case DataType::S8: - case DataType::U8: - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - _func = &batch_concat; - break; - case DataType::S16: - case DataType::U16: - case DataType::F16: - _func = &batch_concat; - break; - case DataType::S32: - case DataType::U32: - case DataType::F32: - _func = &batch_concat; - break; - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - } - - // Configure kernel window - Window win = calculate_max_window(*dst, Steps()); - ICpuKernel::configure(win); -} - -Status CpuConcatenateBatchKernel::validate(const arm_compute::ITensorInfo *src, - unsigned int batch_offset, - const arm_compute::ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, batch_offset, dst)); - return Status{}; -} - -void CpuConcatenateBatchKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), - tensors.get_tensor(TensorType::ACL_DST), - _batch_offset, - window); -} - -const char *CpuConcatenateBatchKernel::name() const -{ - return "CpuConcatenateBatchKernel"; -} -} // namespace kernels -} // namespace 
cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuConcatenateBatchKernel.h b/src/core/cpu/kernels/CpuConcatenateBatchKernel.h deleted file mode 100644 index 1706926fa8..0000000000 --- a/src/core/cpu/kernels/CpuConcatenateBatchKernel.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_CONCATENATE_BATCH_KERNEL_H -#define ARM_COMPUTE_CPU_CONCATENATE_BATCH_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the batch concatenate kernel. - * The input tensor will be concatenated into the output tensor. - */ -class CpuConcatenateBatchKernel : public ICpuKernel -{ -public: - CpuConcatenateBatchKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateBatchKernel); - /** Configure kernel for a given list of arguments - * - * @param[in] src Source tensor info. Data types supported: All. - * @param[in] batch_offset The offset on axis # 3. - * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src. - */ - void configure(const ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuConcatenateBatchKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - using BatchConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &); - -private: - BatchConcatFunction *_func{ nullptr }; - unsigned int _batch_offset{ 0 }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_CONCATENATE_BATCH_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp b/src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp deleted file mode 100644 index 133499deb6..0000000000 --- a/src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
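When the source and destination quantization parameters differ, the concatenation kernels above dequantize with the source parameters and requantize with the destination parameters, 16 lanes at a time in the vector loop and element by element in the scalar tail. A minimal scalar sketch of that per-element step, with illustrative names and rounding/saturation that may differ in detail from the library's quantize/dequantize helpers:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Dequantize with (src_scale, src_offset), requantize with (dst_scale, dst_offset),
// saturating to the QASYMM8 range. Sketch only.
inline uint8_t requantize_qasymm8(uint8_t in, float src_scale, int src_offset, float dst_scale, int dst_offset)
{
    const float real = (static_cast<int>(in) - src_offset) * src_scale;
    const int   q    = static_cast<int>(std::lround(real / dst_scale)) + dst_offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}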
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuConcatenateDepthKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -template -void depth_concat(const ITensor *src, ITensor *dst, unsigned int depth_offset, const Window &window) -{ - // Offset source - uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); - - // Offset destination - uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + depth_offset * dst->info()->strides_in_bytes()[2]; - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const int window_step_x = 16 / dst->info()->element_size(); - - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - win.set(Window::DimZ, Window::Dimension(0, src->info()->tensor_shape().z(), 1)); - - Iterator src_it(src, win); - Iterator dst_it(dst, win); - - const DataType dt = src->info()->data_type(); - const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform(); - if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr + x, vquantize(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); - } - else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) - { - execute_window_loop(win, [&](const 
Coordinates &) - { - const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr + x, vquantize_signed(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); - } - else - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast(dst_ptr + dst_it.offset()); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = *(in_ptr + x); - } - }, - src_it, dst_it); - } -} - -Status validate_arguments(const ITensorInfo *input, unsigned int depth_offset, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX)); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimY) != output->dimension(Window::DimY)); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) + depth_offset > output->dimension(2)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(3, input, output); - - return Status{}; -} -} // namespace - -void CpuConcatenateDepthKernel::configure(const ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, depth_offset, dst)); - - _func = nullptr; - _depth_offset = depth_offset; - - switch(src->data_type()) - { - case DataType::QASYMM8: - _func = &depth_concat; - break; - case DataType::QASYMM8_SIGNED: - _func = &depth_concat; - break; - case DataType::F16: - _func = &depth_concat; - break; - case DataType::F32: - _func = &depth_concat; - break; - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - } - - // Configure kernel window - Window win = calculate_max_window(*dst, Steps()); - ICpuKernel::configure(win); -} - -Status CpuConcatenateDepthKernel::validate(const arm_compute::ITensorInfo *src, - unsigned int depth_offset, - const arm_compute::ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, depth_offset, dst)); - return Status{}; -} - -void CpuConcatenateDepthKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), - tensors.get_tensor(TensorType::ACL_DST), - _depth_offset, - window); -} - -const char *CpuConcatenateDepthKernel::name() const -{ - return "CpuConcatenateDepthKernel"; -} -} // namespace kernels -} // namespace cpu -} // 
namespace arm_compute diff --git a/src/core/cpu/kernels/CpuConcatenateDepthKernel.h b/src/core/cpu/kernels/CpuConcatenateDepthKernel.h deleted file mode 100644 index 3ec19a86d1..0000000000 --- a/src/core/cpu/kernels/CpuConcatenateDepthKernel.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef ARM_COMPUTE_CPU_CONCATENATE_DEPTH_KERNEL_H -#define ARM_COMPUTE_CPU_CONCATENATE_DEPTH_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -// Forward declarations -class ITensor; - -namespace cpu -{ -namespace kernels -{ -/** Interface for the depth concatenate kernel. - * The input tensor will be concatenated into the output tensor. - */ -class CpuConcatenateDepthKernel : public ICpuKernel -{ -public: - CpuConcatenateDepthKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateDepthKernel); - /** Configure kernel for a given list of arguments - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] depth_offset The offset on the Z axis. - * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src. - * - * @note: The output tensor's low two dimensions can't be smaller than the input one's. - * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2. 
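As in the batch case, the depth kernel reaches the right destination slice purely through byte strides: the destination pointer is advanced by depth_offset planes along axis 2 before any copying starts. A small sketch of that address computation, with illustrative parameter names (the real values come from ITensorInfo::offset_first_element_in_bytes() and strides_in_bytes() as shown above):

#include <cstddef>
#include <cstdint>

// Return a pointer to the first destination element of the slice that starts
// depth_offset planes into axis 2 (Z). Mirrors the offsetting done above.
uint8_t *offset_into_depth(uint8_t *dst_base, std::size_t offset_first_element_in_bytes,
                           std::size_t stride_z_in_bytes, unsigned int depth_offset)
{
    return dst_base + offset_first_element_in_bytes + depth_offset * stride_z_in_bytes;
}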
- * - */ - void configure(const ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuConcatenateDepthKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - using DepthConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &); - -private: - DepthConcatFunction *_func{ nullptr }; - unsigned int _depth_offset{ 0 }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_CONCATENATE_DEPTH_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp b/src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp deleted file mode 100644 index dfd442b10a..0000000000 --- a/src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuConcatenateHeightKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. 
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX)); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY)); - for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); - } - - return Status{}; -} -} // namespace - -void CpuConcatenateHeightKernel::configure(const ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst) -{ - ARM_COMPUTE_UNUSED(src); - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, height_offset, dst)); - - _height_offset = height_offset; - - // Configure kernel window - Window win = calculate_max_window(*dst, Steps()); - ICpuKernel::configure(win); -} - -Status CpuConcatenateHeightKernel::validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, height_offset, dst)); - return Status{}; -} - -void CpuConcatenateHeightKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - // Offset destination pointer to the correct position - uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + _height_offset * dst->info()->strides_in_bytes()[Window::DimY]; - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()) * static_cast(dst->info()->element_size()); - const int window_step_x = 16; - - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - win.set(Window::DimY, Window::Dimension(0, src->info()->tensor_shape().y(), 1)); - - // Create iterators - Iterator src_it(src, win); - Iterator dst_it(dst, win); - - const DataType dt = src->info()->data_type(); - const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform(); - if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) - { - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - vst1q_u8(dst_ptr + dst_it.offset() + x, vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo); - } - - }, - src_it, dst_it); - } - else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) - { - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - vst1q_s8(reinterpret_cast(dst_ptr + dst_it.offset() + x), - vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast(src_it.ptr()) + x), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), 
dst_qinfo); - } - }, - src_it, dst_it); - } - else - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = src_it.ptr(); - const auto out_ptr = dst_ptr + dst_it.offset(); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = *(in_ptr + x); - } - }, - src_it, dst_it); - } -} - -const char *CpuConcatenateHeightKernel::name() const -{ - return "CpuConcatenateHeightKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuConcatenateHeightKernel.h b/src/core/cpu/kernels/CpuConcatenateHeightKernel.h deleted file mode 100644 index e5e15e1aee..0000000000 --- a/src/core/cpu/kernels/CpuConcatenateHeightKernel.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_CONCATENATE_HEIGHT_KERNEL_H -#define ARM_COMPUTE_CPU_CONCATENATE_HEIGHT_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the height concatenate kernel. - * The source tensor will be concatenated into the destination tensor. - */ -class CpuConcatenateHeightKernel : public ICpuKernel -{ -public: - CpuConcatenateHeightKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateHeightKernel); - /** Configure kernel for a given list of arguments - * - * @param[in] src Source tensor info. Data types supported: All - * @param[in] height_offset The starting offset on the Y axis for the output tensor. - * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src. 
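A minimal usage sketch for the height concatenation kernel above, checking validate() before configure(). Shapes and the offset are illustrative; at run time the caller packs ITensor pointers as ACL_SRC / ACL_DST and dispatches run_op() over the configured window, as shown above.

#include "arm_compute/core/TensorInfo.h"
#include "src/core/cpu/kernels/CpuConcatenateHeightKernel.h"

using namespace arm_compute;

void configure_height_concat_example()
{
    // One 32x8x3 F32 slice written at row offset 8 of a 32x24x3 destination.
    const TensorInfo   src(TensorShape(32U, 8U, 3U), 1, DataType::F32);
    TensorInfo         dst(TensorShape(32U, 24U, 3U), 1, DataType::F32);
    const unsigned int height_offset = 8;

    if(bool(cpu::kernels::CpuConcatenateHeightKernel::validate(&src, height_offset, &dst)))
    {
        cpu::kernels::CpuConcatenateHeightKernel kernel;
        kernel.configure(&src, height_offset, &dst);
        // Run-time dispatch happens through an ITensorPack holding ACL_SRC/ACL_DST.
    }
}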
- * - */ - void configure(const ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuConcatenateHeightKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - unsigned int _height_offset{ 0 }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_CONCATENATE_HEIGHT_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp b/src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp deleted file mode 100644 index ad33b0c951..0000000000 --- a/src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuConcatenateWidthKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. 
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0)); - - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); - } - - return Status{}; -} -} // namespace - -void CpuConcatenateWidthKernel::configure(const ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, width_offset, dst)); - ARM_COMPUTE_UNUSED(dst); - - _width_offset = width_offset; - - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - - ICpuKernel::configure(win); -} - -Status CpuConcatenateWidthKernel::validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, width_offset, dst)); - return Status{}; -} - -void CpuConcatenateWidthKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - // Offset output pointer to the correct position - uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + _width_offset * dst->info()->strides_in_bytes()[0]; - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()) * static_cast(dst->info()->element_size()); - constexpr int window_step_x = 16; - - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Create iterators - Iterator src_it(src, win); - Iterator dst_it(dst, win); - const DataType dt = src->info()->data_type(); - const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform(); - if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) - { - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - vst1q_u8(dst_ptr + dst_it.offset() + x, vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); - } - else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) - { - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - vst1q_s8(reinterpret_cast(dst_ptr + dst_it.offset() + x), - vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast(src_it.ptr() + x)), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); - } - else - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = src_it.ptr(); - const auto out_ptr = dst_ptr + dst_it.offset(); - int x = window_start_x; - 
for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = *(in_ptr + x); - } - }, - src_it, dst_it); - } -} - -const char *CpuConcatenateWidthKernel::name() const -{ - return "CpuConcatenateWidthKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuConcatenateWidthKernel.h b/src/core/cpu/kernels/CpuConcatenateWidthKernel.h deleted file mode 100644 index f64191e173..0000000000 --- a/src/core/cpu/kernels/CpuConcatenateWidthKernel.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef ARM_COMPUTE_CPU_CONCATENATE_WIDTH_KERNEL_H -#define ARM_COMPUTE_CPU_CONCATENATE_WIDTH_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the width concatenate kernel. - * The source tensor will be concatenated into the destination tensor. - */ -class CpuConcatenateWidthKernel : public ICPPKernel -{ -public: - CpuConcatenateWidthKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateWidthKernel); - /** Configure kernel for a given list of arguments - * - * @param[in] src Source tensor info. Data types supported: All - * @param[in] width_offset The offset on the X axis. - * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src. 
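The width and height kernels iterate in bytes rather than elements: the X loop bound is the element count scaled by element_size, the main loop advances 16 bytes (one 128-bit vector) per step, and a scalar tail copies the remainder. The same structure, stripped of NEON and iterators, looks like this (sketch only):

#include <cstddef>
#include <cstdint>
#include <cstring>

void copy_row_bytes(const uint8_t *in_ptr, uint8_t *out_ptr, std::size_t num_elements, std::size_t element_size)
{
    const std::size_t end  = num_elements * element_size; // loop bound in bytes, as above
    const std::size_t step = 16;                          // one vector register worth of bytes
    std::size_t       x    = 0;
    for(; x + step <= end; x += step)
    {
        std::memcpy(out_ptr + x, in_ptr + x, step); // stands in for wrapper::vloadq / vstore
    }
    for(; x < end; ++x) // left-over bytes
    {
        out_ptr[x] = in_ptr[x];
    }
}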
- */ - void configure(const ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuConcatenateWidthKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - unsigned int _width_offset{ 0 }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_CONCATENATE_WIDTH_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp b/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp deleted file mode 100644 index 5406356bc9..0000000000 --- a/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_input_shape, - DataLayout data_layout) - -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Output tensor auto initialisation if not yet initialized - auto_init_if_empty(*dst, *src->clone()); - - ARM_COMPUTE_ERROR_THROW_ON(CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_input_shape, data_layout)); - - const DataLayout input_data_layout = (data_layout == DataLayout::NCHW) ? 
DataLayout::NHWC : DataLayout::NCHW; - - const int width_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::WIDTH); - const int height_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::HEIGHT); - const int channel_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::CHANNEL); - - const unsigned int num_elems_per_input_plane = original_input_shape[width_idx] * original_input_shape[height_idx]; - const unsigned int num_channels = original_input_shape[channel_idx]; - - _factor1 = (data_layout == DataLayout::NCHW) ? num_elems_per_input_plane : num_channels; - _factor2 = (data_layout == DataLayout::NCHW) ? num_channels : num_elems_per_input_plane; - - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - ICpuKernel::configure(win); -} - -Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_input_shape, - DataLayout data_layout) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() != 2); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(1) != original_input_shape.total_size_lower(3)); - ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN); - - // Checks performed when dst is configured - if((dst != nullptr) && (dst->total_size() != 0)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - - return Status{}; -} - -void CpuConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - const unsigned int dst_stride_x = dst->info()->strides_in_bytes().x(); - const unsigned int dst_stride_y = dst->info()->strides_in_bytes().y(); - const unsigned int element_size = src->info()->element_size(); - - Iterator input(src, window); - Iterator output(dst, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - memcpy(output.ptr() + id.x() * dst_stride_x + (id.y() % _factor1 * _factor2 + id.y() / _factor1) * dst_stride_y, input.ptr(), element_size); - }, - input); -} - -const char *CpuConvertFullyConnectedWeightsKernel::name() const -{ - return "CpuConvertFullyConnectedWeightsKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h b/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h deleted file mode 100644 index 7baaf13417..0000000000 --- a/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H -#define ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface to convert the 2D Fully Connected weights from NCHW to NHWC or vice versa. - * - * @note This function can be applied to the 2D weights used by a Fully Connected layer if: - * - It follows a Convolution layer - * - The data layout used by the network does not match the one the model has been trained in. - * - * @note This function assumes the weights are already reshaped (transposed) - */ -class CpuConvertFullyConnectedWeightsKernel : public ICpuKernel -{ -public: - CpuConvertFullyConnectedWeightsKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConvertFullyConnectedWeightsKernel); - /** Set the src and dst tensor. - * - * @param[in] src Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All. - * @param[in] dst The converted weights tensor info. Shape and Data Type: Same as @p src. - * @param[in] original_input_shape Shape of the original src tensor (the one entering fully connected layer). - * @param[in] data_layout The data layout the weights have been trained in. 
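The conversion above is a pure row permutation of the 2D weights: source row y is written to row (y % _factor1) * _factor2 + y / _factor1 of the destination, which reorders the flattened input between plane-major (NCHW) and channel-interleaved (NHWC) orderings. A small sketch of the index computation with an illustrative worked case:

#include <cstddef>

// Destination row for source row y, mirroring the offset computation in
// run_op() above. factor1/factor2 follow the member documentation below.
inline std::size_t converted_row(std::size_t y, std::size_t factor1, std::size_t factor2)
{
    return (y % factor1) * factor2 + y / factor1;
}

// Example: original input shape 2x2x3 (W x H x C), trained in NCHW, so
// factor1 = W * H = 4 and factor2 = C = 3. The 12 weight rows map
// 0->0, 1->3, 2->6, 3->9, 4->1, 5->4, ... i.e. channels become contiguous
// per spatial position, matching NHWC ordering of the flattened input.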
- */ - void configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_input_shape, DataLayout data_layout); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuConvertFullyConnectedWeightsKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_input_shape, DataLayout data_layout); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - unsigned int _factor1{ 0 }; /* equals to the number of elements per original src plane if @p data_layout == NCHW; its number of channels otherwise */ - unsigned int _factor2{ 0 }; /* equals to the number of elements per original src plane if @p data_layout == NHWC; its number of channels otherwise */ -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H */ \ No newline at end of file diff --git a/src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp b/src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp deleted file mode 100644 index 26cbb48deb..0000000000 --- a/src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - - // Validate output if initialized - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape()); - } - - return Status{}; -} - -std::pair validate_and_configure_window(const ITensorInfo *src, ITensorInfo *dst) -{ - // Output auto inizialitation if not yet initialized - { - const bool is_input_signed = src->data_type() == DataType::QASYMM8_SIGNED; - const DataType dt = is_input_signed ? DataType::QASYMM8 : DataType::QASYMM8_SIGNED; - const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); - const int offset_correction = is_input_signed ? -128 : 128; - const QuantizationInfo corrected_qinfo = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction); - - auto_init_if_empty(*dst, src->clone()->set_data_type(dt).set_quantization_info(corrected_qinfo)); - } - - return std::make_pair(Status{}, calculate_max_window(*dst)); -} -} // namespace - -void CpuConvertQuantizedSignednessKernel::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - - std::pair win_config = validate_and_configure_window(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICpuKernel::configure(win_config.second); -} - -Status CpuConvertQuantizedSignednessKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); - return Status{}; -} - -void CpuConvertQuantizedSignednessKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - const uint8_t mask = 128; - const auto vmask = wrapper::vdup_n(mask, wrapper::traits::vector_128_tag{}); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += 
window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - wrapper::vstore(output_ptr + x, wrapper::veor(vin, vmask)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const uint8_t in = *(reinterpret_cast(input_ptr + x)); - *(output_ptr + x) = in ^ mask; - } - }, - input, output); -} - -const char *CpuConvertQuantizedSignednessKernel::name() const -{ - return "CpuConvertQuantizedSignednessKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.h b/src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.h deleted file mode 100644 index 2a8f6c364d..0000000000 --- a/src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_CONVERTQUANTIZEDSIGNEDNESS_KERNEL_H -#define ARM_COMPUTE_CPU_CONVERTQUANTIZEDSIGNEDNESS_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel to convert asymmetric signed to asymmetric signed and vice-versa */ -class CpuConvertQuantizedSignednessKernel : public ICpuKernel -{ -public: - CpuConvertQuantizedSignednessKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConvertQuantizedSignednessKernel); - /** Initialize the kernel input and output info. - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED. - * @param[out] dst Destination tensor info. Data types supported: opposite of @p src. 
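The kernel above flips each stored byte with an XOR against 0x80, which maps a signed value s in [-128, 127] to the unsigned value s + 128 and back again; validate_and_configure_window() additionally corrects the output quantization offset by 128 so the two encodings describe the same real values. A scalar sketch of the byte flip (illustrative helper names):

#include <cstdint>

inline uint8_t signed_to_unsigned_qasymm8(int8_t s)
{
    return static_cast<uint8_t>(static_cast<uint8_t>(s) ^ 0x80); // -128 -> 0, 0 -> 128, 127 -> 255
}

inline int8_t unsigned_to_signed_qasymm8(uint8_t u)
{
    return static_cast<int8_t>(u ^ 0x80); // 0 -> -128, 128 -> 0, 255 -> 127
}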
- */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuConvertQuantizedSignednessKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /*ARM_COMPUTE_CPU_CONVERTQUANTIZEDSIGNEDNESS_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuCopyKernel.cpp b/src/core/cpu/kernels/CpuCopyKernel.cpp deleted file mode 100644 index 8ec354b2aa..0000000000 --- a/src/core/cpu/kernels/CpuCopyKernel.cpp +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuCopyKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PaddingList &padding = PaddingList()) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 4); - - // Validate destination if initialized - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_padded_shape(src->tensor_shape(), padding), dst->tensor_shape()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - - return Status{}; -} - -std::pair validate_and_configure_window(const ITensorInfo *src, ITensorInfo *dst) -{ - // Destination auto inizialitation if not yet initialized - auto_init_if_empty(*dst, *src); - return std::make_pair(Status{}, calculate_max_window(*dst)); -} - -std::pair validate_and_configure_window_with_padding(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding) -{ - const TensorShape src_shape = src->tensor_shape(); - const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(src_shape, padding); - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(padded_shape)); - // Configure window - const Window win = calculate_max_window(*dst, dst->dimension(0)); - return std::make_pair(Status{}, win); -} - -} // namespace - -void CpuCopyKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, padding)); - - _padding = padding; - - std::pair win_config; - if(padding.empty()) - { - win_config = validate_and_configure_window(src, dst); - } - else - { - win_config = validate_and_configure_window_with_padding(src, dst, padding); - } - - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICpuKernel::configure(win_config.second); -} - -Status CpuCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, const PaddingList &padding) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, padding)); - - if(padding.empty()) - { - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_with_padding(src->clone().get(), dst->clone().get(), padding).first); - } - - return Status{}; -} - -void CpuCopyKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - if(_padding.empty()) - { - Window dst_window{ window }; - dst_window.set(Window::DimX, Window::Dimension(dst_window.x().start(), dst_window.x().end(), src->info()->dimension(0))); - Window out_slice = 
dst_window.first_slice_window_1D(); - do - { - Iterator src_it(src, out_slice); - Iterator dst_it(dst, out_slice); - - execute_window_loop(out_slice, [&](const Coordinates &) - { - memcpy(dst_it.ptr(), src_it.ptr(), dst->info()->dimension(0) * dst->info()->element_size()); - }, - src_it, dst_it); - } - while(dst_window.slide_window_slice_1D(out_slice)); - } - else - { - Window src_window{ window }; - src_window.set(Window::DimX, Window::Dimension(0, window.x().end() - _padding[0].first, src->info()->dimension(0))); - - Iterator src_it(src, src_window); - Iterator dst_it(dst, window); - const size_t row_size_in_bytes = src->info()->dimension(0) * src->info()->element_size(); - execute_window_loop(window, [&](const Coordinates &) - { - auto dst_ptr = dst_it.ptr() + _padding[0].first * dst->info()->element_size(); - std::memcpy(dst_ptr, src_it.ptr(), row_size_in_bytes); - }, - src_it, dst_it); - } -} - -const char *CpuCopyKernel::name() const -{ - return "CpuCopyKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuCopyKernel.h b/src/core/cpu/kernels/CpuCopyKernel.h deleted file mode 100644 index e2f1ed60a6..0000000000 --- a/src/core/cpu/kernels/CpuCopyKernel.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_COPY_KERNEL_H -#define ARM_COMPUTE_CPU_COPY_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel to perform a copy between two tensors */ -class CpuCopyKernel : public ICpuKernel -{ -public: - CpuCopyKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuCopyKernel); - /** Configure kernel for a given list of arguments - * - * @param[in] src Source tensor. Data types supported: All - * @param[out] dst Destination tensor. Data types supported: same as @p src. 
- * @param[in] padding (Optional) Padding to be applied to the input tensor - */ - void configure(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding = PaddingList()); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuCopyKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PaddingList &padding = PaddingList()); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - PaddingList _padding{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_COPY_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp deleted file mode 100644 index 5530eba9f1..0000000000 --- a/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp +++ /dev/null @@ -1,950 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/ITensorInfo.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/wrapper/traits.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/ToolchainSupport.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -constexpr auto data_layout = DataLayout::NHWC; -const size_t width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); -const size_t height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); -const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - -constexpr auto dim_manual_loop = Window::Dimension(0, 0, 0); -constexpr auto dim_single_unit_step = Window::Dimension(0, 1, 1); -constexpr size_t vector_size = 8; - -struct DepthwiseConvolutionRunInfo -{ - const size_t num_read_elements_per_iteration; - const uint32_t x_start; - const uint32_t x_end; - const uint32_t x_step; - const uint32_t x_leftover_start; - const size_t input_stride_y; - const size_t input_stride_z; - const size_t input_max_offset; - const size_t weights_width; - const size_t weights_height; - const size_t weights_stride_y; - const size_t weights_stride_z; - const size_t conv_stride_x; - const size_t conv_stride_y; - const size_t conv_pad_left; - const size_t conv_pad_top; - const size_t input_height; - const size_t input_width; - const size_t input_depth; - - DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1) // NOLINT - : num_read_elements_per_iteration((depth_multiplier == 1 ? 
(vector_size / element_size_from_data_type(input.data_type())) : 1)), - x_start(w.x().start()), - x_end(w.x().end()), - x_step(static_cast(num_read_elements_per_iteration * depth_multiplier)), - x_leftover_start(std::max(static_cast(w.x().end()) - static_cast(x_step) + 1, int32_t(0))), - input_stride_y(input.strides_in_bytes().y()), - input_stride_z(input.strides_in_bytes().z()), - input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()), - weights_width(weights.dimension(width_idx)), - weights_height(weights.dimension(height_idx)), - weights_stride_y(weights.strides_in_bytes().y()), - weights_stride_z(weights.strides_in_bytes().z()), - conv_stride_x(conv_info.stride().first), - conv_stride_y(conv_info.stride().second), - conv_pad_left(conv_info.pad_left()), - conv_pad_top(conv_info.pad_top()), - input_height(input.dimension(height_idx)), - input_width(input.dimension(width_idx)), - input_depth(input.dimension(channel_idx)) - { - } -}; - -inline int32x4_t saturating_doubling_high_mul(const int32x4_t &a, const int32_t &b) -{ - return vqrdmulhq_n_s32(a, b); -} - -inline int32_t saturating_doubling_high_mul(const int32_t &a, const int32_t &b) -{ - return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0); -} - -inline int32x4_t rounding_divide_by_exp2(const int32x4_t &x, const int exponent) -{ - const int32x4_t shift = vdupq_n_s32(-exponent); - const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31); - const int32x4_t fixed = vqaddq_s32(x, fixup); - return vrshlq_s32(fixed, shift); -} - -inline int32x2_t rounding_divide_by_exp2(const int32x2_t &x, const int exponent) -{ - const int32x2_t shift = vdup_n_s32(-exponent); - const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31); - const int32x2_t fixed = vqadd_s32(x, fixup); - return vrshl_s32(fixed, shift); -} - -inline int32_t rounding_divide_by_exp2(const int32_t &x, const int exponent) -{ - const int32x2_t xs = vdup_n_s32(x); - return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0); -} - -inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation) -{ - const int32_t current_h = base_h + h * dilation.y(); - const bool is_valid_h = current_h >= 0 && current_h < static_cast(run_info.input_height); - - const int32_t current_w = base_w + w * dilation.x(); - const bool is_valid_w = current_w >= 0 && current_w < static_cast(run_info.input_width); - - return is_valid_h && is_valid_w; -} - -template -void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, const Window &window, bool has_biases) -{ - constexpr auto element_per_vector = vector_size / sizeof(T); - using VectorType = typename wrapper::traits::neon_vector::type; - using TagType = typename wrapper::traits::neon_vector::tag_type; - - const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window); - - const VectorType zero_vector = wrapper::vdup_n(static_cast(0), TagType{}); - - Window execution_window = window; - execution_window.set(Window::DimX, dim_single_unit_step); - - Window win_input = window; - win_input.set(Window::DimX, dim_manual_loop); - win_input.set(Window::DimY, dim_manual_loop); - win_input.set(Window::DimZ, dim_manual_loop); - - Window win_weights = win_input; - win_weights.set(Window::DimW, dim_manual_loop); 
- - Window win_output = window; - win_output.set(Window::DimX, dim_manual_loop); - - Iterator input_it(src, win_input); - Iterator weights_it(weights, win_weights); - Iterator output_it(dst, win_output); - Iterator biases_it{}; - - if(has_biases) - { - biases_it = Iterator(biases, win_weights); - } - - execute_window_loop(execution_window, [&](const Coordinates & id) - { - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - - auto const base_weights_ptr = weights_it.ptr(); - uint32_t x = run_info.x_start; - - for(; x < run_info.x_leftover_start; x += run_info.x_step) - { - VectorType acc = zero_vector; - auto weights_ptr = base_weights_ptr; - int64_t input_offset = base_input_offset; - - for(uint32_t h = 0; h < run_info.weights_height; ++h) - { - int64_t offs = input_offset + x * sizeof(T); - for(uint32_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_vals = is_valid_region ? - wrapper::vload(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : - zero_vector; - const auto weights_vals = wrapper::vload(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); - acc = wrapper::vmla(acc, weights_vals, input_vals); - - offs += dilation.x() * run_info.input_stride_y; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - if(has_biases) - { - const auto biases_vals = wrapper::vload(reinterpret_cast(biases_it.ptr()) + x); - acc = wrapper::vadd(acc, biases_vals); - } - - wrapper::vstore(reinterpret_cast(output_it.ptr()) + x, acc); - } - - for(; x < run_info.x_end; ++x) - { - auto acc_scalar = T{ 0 }; - auto weights_ptr = base_weights_ptr; - int64_t input_offset = base_input_offset; - - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int64_t offs = input_offset + x * sizeof(T); - for(size_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_vals = is_valid_region ? 
*reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset)) : 0; - const auto weights_vals = *(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); - - acc_scalar += (input_vals * weights_vals); - - offs += dilation.x() * run_info.input_stride_y; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - if(has_biases) - { - const auto biases_vals = *(reinterpret_cast(biases_it.ptr()) + x); - acc_scalar += biases_vals; - } - *(reinterpret_cast(output_it.ptr()) + x) = acc_scalar; - } - }, - input_it, weights_it, biases_it, output_it); -} - -template -void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases) -{ - const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); - - Window execution_window = window; - execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); - - Window win_input = execution_window; - win_input.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); - win_input.set(Window::DimY, dim_manual_loop); - win_input.set(Window::DimZ, dim_manual_loop); - - Window win_weights = window; - win_weights.set_dimension_step(Window::DimX, run_info.x_step); - win_weights.set(Window::DimY, dim_manual_loop); - win_weights.set(Window::DimZ, dim_manual_loop); - win_weights.set(Window::DimW, dim_manual_loop); - - Window win_output = window; - win_output.set_dimension_step(Window::DimX, run_info.x_step); - - Iterator input_it(src, win_input); - Iterator weights_it(weights, win_weights); - Iterator output_it(dst, win_output); - Iterator biases_it{}; - - if(has_biases) - { - biases_it = Iterator(biases, win_weights); - } - - execute_window_loop(execution_window, [&](const Coordinates & id) - { - std::vector acc(depth_multiplier, static_cast(0)); - - const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - - auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int offs = input_offset; - for(size_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_val = is_valid_region ? 
*(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : T(0); - - for(size_t m = 0; m < depth_multiplier; ++m) - { - const auto weights_val = *(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); - acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m)); - } - - offs += dilation.x() * run_info.input_stride_y; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - if(has_biases) - { - for(size_t m = 0; m < depth_multiplier; ++m) - { - const auto biases_val = *(reinterpret_cast(biases_it.ptr() + m * sizeof(T))); - *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val; - } - } - else - { - for(size_t m = 0; m < depth_multiplier; ++m) - { - *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = acc.at(m); - } - } - }, - input_it, weights_it, biases_it, output_it); -} - -template -void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) // NOLINT -{ - ARM_COMPUTE_UNUSED(output_multiplier, output_shift); - constexpr auto element_per_vector = vector_size / sizeof(T); - using VectorType = typename wrapper::traits::neon_vector::type; - using TagType = typename wrapper::traits::neon_vector::tag_type; - using AccType = int32_t; - using AccArrayType = std::array; - - const auto out_of_bound_value = PixelValue(static_cast(0), src->info()->data_type(), src->info()->quantization_info()).get(); - const auto out_of_bound_vector = wrapper::vdup_n(static_cast(out_of_bound_value), TagType{}); - - const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window); - - const int32_t input_qoffset = src->info()->quantization_info().uniform().offset; - const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset; - const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset; - const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset; - - Window execution_window = window; - execution_window.set(Window::DimX, dim_single_unit_step); - - Window win_input = window; - win_input.set(Window::DimX, dim_manual_loop); - win_input.set(Window::DimY, dim_manual_loop); - win_input.set(Window::DimZ, dim_manual_loop); - - Window win_weights = win_input; - win_weights.set(Window::DimW, dim_manual_loop); - - Window win_output = window; - win_output.set(Window::DimX, dim_manual_loop); - - Iterator input_it(src, win_input); - Iterator weights_it(weights, win_weights); - Iterator output_it(dst, win_output); - Iterator biases_it{}; - - if(has_biases) - { - biases_it = Iterator(biases, win_weights); - } - - execute_window_loop(execution_window, [&](const Coordinates & id) - { - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - auto const base_weights_ptr = weights_it.ptr(); - size_t x = run_info.x_start; - - for(; x < run_info.x_leftover_start; x += run_info.x_step) - { - AccArrayType acc{}; - AccArrayType in_sum{}; - AccArrayType we_sum{}; - - auto weights_ptr = base_weights_ptr; - auto input_offset = 
base_input_offset; - - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int64_t offs = input_offset + x * sizeof(T); - for(size_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_vals = is_valid_region ? - wrapper::vload(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : - out_of_bound_vector; - const auto weights_vals = wrapper::vload(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); - - for(size_t i = 0; i < element_per_vector; ++i) - { - acc.at(i) += input_vals[i] * weights_vals[i]; - in_sum.at(i) += input_vals[i]; - we_sum.at(i) += weights_vals[i]; - } - - offs += dilation.x() * run_info.input_stride_y; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - VectorType out_vals = wrapper::vdup_n(static_cast(0), TagType{}); - for(size_t i = 0; i < element_per_vector; ++i) - { - acc.at(i) -= in_sum.at(i) * weights_qoffset; - acc.at(i) -= we_sum.at(i) * input_qoffset; - acc.at(i) += k_offset; - - if(has_biases) - { - acc.at(i) += *(reinterpret_cast(biases_it.ptr() + i * sizeof(int32_t)) + x); - } - - const int32_t out_mul = output_multiplier.at(x + i); - const int32_t out_shift = output_shift.at(x + i); - if(out_shift < 0) - { - acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset; - } - else - { - acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset; - } - out_vals[i] = static_cast(utility::clamp(acc.at(i))); - } - - wrapper::vstore(reinterpret_cast(output_it.ptr()) + x, out_vals); - } - - // left-over - for(; x < run_info.x_end; ++x) - { - AccType acc = 0; - AccType in_sum = 0; - AccType we_sum = 0; - - auto weights_ptr = base_weights_ptr; - auto input_offset = base_input_offset; - - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int64_t offs = input_offset + x * sizeof(T); - for(size_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_val = is_valid_region ? 
- *reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset)) : - out_of_bound_value; - const auto weights_val = *(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); - - acc += input_val * weights_val; - in_sum += input_val; - we_sum += weights_val; - - offs += dilation.x() * run_info.input_stride_y; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - T out_vals{ 0 }; - - acc -= in_sum * weights_qoffset; - acc -= we_sum * input_qoffset; - acc += k_offset; - - if(has_biases) - { - acc += *(reinterpret_cast(biases_it.ptr()) + x); - } - - const int32_t out_mul = output_multiplier.at(x); - const int32_t out_shift = output_shift.at(x); - - if(out_shift < 0) - { - acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset; - } - else - { - acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset; - } - - out_vals = static_cast(utility::clamp(acc)); - *(reinterpret_cast(output_it.ptr()) + x) = out_vals; - } - }, - input_it, weights_it, biases_it, output_it); -} - -template -void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, unsigned int depth_multiplier, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) // NOLINT -{ - using AccType = int32_t; - - const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); - - const auto out_of_bound_value = PixelValue(static_cast(0), src->info()->data_type(), src->info()->quantization_info()).get(); - - const int32_t input_qoffset = src->info()->quantization_info().uniform().offset; - const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset; - const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset; - const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset; - - Window execution_window = window; - execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); - - Window win_input = execution_window; - win_input.set(Window::DimY, dim_manual_loop); - win_input.set(Window::DimZ, dim_manual_loop); - - Window win_weights = window; - win_weights.set_dimension_step(Window::DimX, run_info.x_step); - win_weights.set(Window::DimY, dim_manual_loop); - win_weights.set(Window::DimZ, dim_manual_loop); - win_weights.set(Window::DimW, dim_manual_loop); - - Window win_output = window; - win_output.set_dimension_step(Window::DimX, run_info.x_step); - - Iterator input_it(src, win_input); - Iterator weights_it(weights, win_weights); - Iterator output_it(dst, win_output); - Iterator biases_it{}; - - if(has_biases) - { - biases_it = Iterator(biases, win_weights); - } - - execute_window_loop(execution_window, [&](const Coordinates & id) - { - std::vector acc(depth_multiplier, 0); - std::vector we_sum(depth_multiplier, 0); - AccType in_sum = 0; - - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - - auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int offs = input_offset; - for(size_t w = 0; w < 
run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_val = is_valid_region ? *(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : out_of_bound_value; - - for(size_t m = 0; m < depth_multiplier; ++m) - { - const auto weights_val = *(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); - acc.at(m) += input_val * weights_val; - - we_sum.at(m) += weights_val; - } - - offs += dilation.x() * run_info.input_stride_y; - in_sum += input_val; - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - for(size_t m = 0; m < depth_multiplier; ++m) - { - acc.at(m) -= in_sum * weights_qoffset; - acc.at(m) -= we_sum.at(m) * input_qoffset; - acc.at(m) += k_offset; - - if(has_biases) - { - acc.at(m) += *(reinterpret_cast(biases_it.ptr() + m * sizeof(int32_t))); - } - - const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m); - const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m); - if(out_shift < 0) - { - acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset; - } - else - { - acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset; - } - *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = static_cast(utility::clamp(acc.at(m))); - } - }, - input_it, weights_it, biases_it, output_it); -} - -template -void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, unsigned int depth_multiplier, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) // NOLINT -{ - constexpr int half_vec = vector_size / 2; - - using AccType = int32_t; - using AccVectorType = typename wrapper::traits::neon_vector::type; - using AccVectorTagType = typename wrapper::traits::neon_vector::tag_type; - using TagType = typename wrapper::traits::neon_vector::tag_type; - - const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); - - const auto input_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast(src->info()->quantization_info().uniform().offset), TagType{}))); - const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast(weights->info()->quantization_info().uniform().offset), TagType{}))); - const auto output_qoffset_vec = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{}); - - const auto lower = wrapper::vdup_n(static_cast(std::numeric_limits::lowest()), AccVectorTagType{}); - const auto upper = wrapper::vdup_n(static_cast(std::numeric_limits::max()), AccVectorTagType{}); - const auto zero = wrapper::vdup_n(static_cast(0), AccVectorTagType{}); - - const auto out_mul = output_multiplier.at(0); - const auto out_shift = output_shift.at(0); - - Window execution_window = window; - execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); - - Window win_input = execution_window; - win_input.set(Window::DimY, dim_manual_loop); - win_input.set(Window::DimZ, dim_manual_loop); - - Window win_weights = window; - win_weights.set_dimension_step(Window::DimX, run_info.x_step); - win_weights.set(Window::DimY, 
dim_manual_loop); - win_weights.set(Window::DimZ, dim_manual_loop); - win_weights.set(Window::DimW, dim_manual_loop); - - Window win_output = window; - win_output.set_dimension_step(Window::DimX, run_info.x_step); - - Iterator input_it(src, win_input); - Iterator weights_it(weights, win_weights); - Iterator output_it(dst, win_output); - Iterator biases_it{}; - - if(has_biases) - { - biases_it = Iterator(biases, win_weights); - } - - std::vector acc0(depth_multiplier / vector_size); - std::vector acc1(depth_multiplier / vector_size); - - execute_window_loop(execution_window, [&](const Coordinates & id) - { - std::fill(begin(acc0), end(acc0), zero); - std::fill(begin(acc1), end(acc1), zero); - - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - - auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < run_info.weights_height; ++h) - { - const int32_t current_h = input_z + h * dilation.y(); - if(current_h >= 0 && current_h < static_cast(run_info.input_height)) - { - int offs = input_offset; - for(size_t w = 0; w < run_info.weights_width; ++w) - { - const int32_t current_w = input_y + w * dilation.x(); - if(current_w >= 0 && current_w < static_cast(run_info.input_width)) - { - const auto input_8x8 = wrapper::vdup_n(*(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))), TagType{}); - const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8)); - const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec); - - for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) - { - const auto weights_8x8 = wrapper::vload(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); - const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8)); - const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec); - - acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), wrapper::vgetlow(weights_no_offs)); - acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), wrapper::vgethigh(weights_no_offs)); - } - } - - offs += dilation.x() * run_info.input_stride_y; - } - } - - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) - { - if(has_biases) - { - const auto bias_val0 = wrapper::vloadq(reinterpret_cast(biases_it.ptr() + m * sizeof(int32_t))); - const auto bias_val1 = wrapper::vloadq(reinterpret_cast(biases_it.ptr() + (m + half_vec) * sizeof(int32_t))); - - acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0); - acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1); - } - - if(out_shift < 0) - { - acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec); - acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec); - } - else - { - acc0.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec); - acc1.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec); - } - - acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper); - acc1.at(i) = 
wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper); - - const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)), - wrapper::vmovn(acc1.at(i))); - - if(std::is_same::value) - { - wrapper::vstore(reinterpret_cast(output_it.ptr() + m * sizeof(uint8_t)), wrapper::vqmovn(vreinterpretq_u16_s16(out_val))); - } - else - { - wrapper::vstore(reinterpret_cast(output_it.ptr() + m * sizeof(int8_t)), wrapper::vqmovn(out_val)); - } - } - }, - input_it, weights_it, biases_it, output_it); -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(info.depth_multiplier == 0); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > src->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right()); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > src->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom()); - ARM_COMPUTE_RETURN_ERROR_ON((src->dimension(0) * info.depth_multiplier) != weights->dimension(0)); - ARM_COMPUTE_RETURN_ERROR_ON((info.dilation.x() < 1) || (info.dilation.y() < 1)); - ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) || (info.pad_stride_info.stride().second < 1)); - - if(is_data_type_quantized_per_channel(weights->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size()); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); - } - - if(biases != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0)); - - if(is_data_type_quantized_asymmetric(src->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); - } - } - - if(dst->total_size() != 0) - { - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - - return Status{}; -} -} // namespace - -void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, (biases != nullptr) ? 
biases : nullptr, dst, info)); - - _conv_info = info.pad_stride_info; - _depth_multiplier = info.depth_multiplier; - _dilation = info.dilation; - _has_biases = (biases != nullptr); - - if(is_data_type_quantized(src->data_type())) - { - const auto input_scale = src->quantization_info().uniform().scale; - const auto output_scale = dst->quantization_info().uniform().scale; - - auto weights_scale = weights->quantization_info().scale(); - if(!is_data_type_quantized_per_channel(weights->data_type())) - { - for(size_t i = 1; i < weights->dimension(channel_idx); ++i) - { - weights_scale.push_back(weights_scale.front()); - } - } - - for(const auto &s : weights_scale) - { - int32_t out_mult = 0; - int32_t out_shift = 0; - const float multiplier = input_scale * s / output_scale; - arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift); - - _output_multiplier.push_back(out_mult); - _output_shift.push_back(out_shift); - } - } - - switch(weights->data_type()) - { - case DataType::QASYMM8: - _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise; - break; - case DataType::QASYMM8_SIGNED: - _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise; - break; - case DataType::QSYMM8_PER_CHANNEL: - if(src->data_type() == DataType::QASYMM8) - { - _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise; - } - else - { - _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise; - } - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise; - break; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise; - break; - default: - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); - auto_init_if_empty(*dst, src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(dst->quantization_info())); - - Window win = calculate_max_window(*dst, Steps()); - ICpuKernel::configure(win); -} - -Status CpuDepthwiseConv2dNativeKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, info)); - return Status{}; -} - -template > -void CpuDepthwiseConv2dNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - if(_depth_multiplier == 1) - { - depthwise_loop_multiplier1_fp(src, weights, biases, dst, _conv_info, _dilation, window, has_biases); - } - else - { - depthwise_loop_generic_fp(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, window, has_biases); - } -} - -template > -void CpuDepthwiseConv2dNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - if(_depth_multiplier == 1) - { - depthwise_loop_multiplier1_quantized(src, weights, biases, dst, _conv_info, _dilation, _output_multiplier, _output_shift, window, has_biases); - } - else - { - const bool is_pow2 = 
((_depth_multiplier & (_depth_multiplier - 1)) == 0); - const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(weights->info()->data_type())); - - if(is_pow2 && is_quantized_per_tensor && _depth_multiplier >= 8) - { - depthwise_loop_pow2_quantized_per_tensor(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases); - } - else - { - depthwise_loop_generic_quantized(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases); - } - } -} - -void CpuDepthwiseConv2dNativeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - const auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - const auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - (this->*_func)(src, weights, biases, dst, window, _has_biases); -} - -const char *CpuDepthwiseConv2dNativeKernel::name() const -{ - return "CpuDepthwiseConv2dNativeKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h deleted file mode 100644 index eb7041f7b6..0000000000 --- a/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_NATIVE_KERNEL_H -#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_NATIVE_KERNEL_H - -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" -#include "support/Requires.h" - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#include -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the kernel to run a depthwise convolution native on a tensor. 
 */
-class CpuDepthwiseConv2dNativeKernel : public ICpuKernel
-{
-public:
-    CpuDepthwiseConv2dNativeKernel() = default;
-    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dNativeKernel);
-
-    /** Initialize the function's source, destination and parameters.
-     *
-     * @note Supported data layouts: NHWC
-     *
-     * @param[in]  src     Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in]  weights Weights tensor. This is a 3D tensor with dimensions [IFM, W, H].
-     *                     Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
-     * @param[in]  biases  Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
-     *                     Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
-     * @param[out] dst     Destination tensor. Data type supported: Same as @p src.
-     * @param[in]  info    Depthwise convolution meta-data.
-     *
-     */
-    void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
-    /** Static function to check if given info will lead to a valid configuration
-     *
-     * Similar to CpuDepthwiseConv2dNativeKernel::configure()
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
-
-    // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
-    const char *name() const override;
-
-private:
-    template <typename T>
-    using FloatEnalber = typename std::enable_if<std::is_floating_point<T>::value, int>::type;
-
-    template <typename T, typename TW, FloatEnalber<T> = 0>
-    void run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases);
-
-    template <typename T>
-    using Quantized8bitEnalber = typename std::enable_if < std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int >::type;
-
-    template <typename T, typename TW, Quantized8bitEnalber<T> = 0>
-    void run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases);
-
-    /** Common signature for all the specialised depthwise convolution native functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using DepthwiseFunctionPtr = void (CpuDepthwiseConv2dNativeKernel::*)(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases);
-
-    DepthwiseFunctionPtr _func{ nullptr };
-    PadStrideInfo        _conv_info{};
-    unsigned int         _depth_multiplier{ 1 };
-    Size2D               _dilation{};
-    std::vector<int32_t> _output_multiplier{};
-    std::vector<int32_t> _output_shift{};
-    bool                 _has_biases{ false };
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_NATIVE_KERNEL_H */
diff --git a/src/core/cpu/kernels/CpuDequantizeKernel.cpp b/src/core/cpu/kernels/CpuDequantizeKernel.cpp
deleted file mode 100644
index 42b5439697..0000000000
--- a/src/core/cpu/kernels/CpuDequantizeKernel.cpp
+++ /dev/null
@@ -1,400 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuDequantizeKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NESymm.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16); - - if(dst->tensor_shape().total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - - return Status{}; -} - -template -inline void store_result(T *ptr, const float32x4x4_t &v) -{ - ARM_COMPUTE_UNUSED(ptr, v); -} - -template <> -inline void store_result(float *ptr, const float32x4x4_t &v) -{ - wrapper::vstore(ptr, v.val[0]); - wrapper::vstore(ptr + 4, v.val[1]); - wrapper::vstore(ptr + 8, v.val[2]); - wrapper::vstore(ptr + 12, v.val[3]); -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template <> -inline void store_result(float16_t *ptr, const float32x4x4_t &v) -{ - wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1]))); - wrapper::vstore(ptr + 8, vcombine_f16(vcvt_f16_f32(v.val[2]), vcvt_f16_f32(v.val[3]))); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -template -inline void store_result(T *ptr, const float32x4x2_t &v) -{ - ARM_COMPUTE_UNUSED(ptr, v); -} - -template <> -inline void store_result(float *ptr, const float32x4x2_t &v) -{ - wrapper::vstore(ptr, v.val[0]); - wrapper::vstore(ptr + 4, v.val[1]); -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template <> -inline void store_result(float16_t *ptr, const float32x4x2_t &v) -{ - wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1]))); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -template 
-void run_dequantization_qasymm8(const ITensor *input, ITensor *output, const Window &window) -{ - const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform(); - const float scale = qinfo.scale; - const int32_t offset = qinfo.offset; - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Create iterators - Iterator in(input, win_collapsed); - Iterator out(output, win_collapsed); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, scale, offset); - - store_result(reinterpret_cast(out_ptr + x), vdeq); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - auto val = *(in_ptr + x); - *(out_ptr + x) = static_cast(Qasymm8QuantizationHelper::dequantize(val, qinfo)); - } - }, - in, out); -} - -template -void run_dequantization_qsymm8_per_channel_nchw(const ITensor *input, ITensor *output, const Window &window) -{ - const auto scale = input->info()->quantization_info().scale(); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - // Reset first dimension to handle tail calculations manually - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Create iterators - Iterator in(input, win); - Iterator out(output, win); - - execute_window_loop(win, [&](const Coordinates & id) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, scale[id.z()]); - - store_result(reinterpret_cast(out_ptr + x), vdeq); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int8_t val = *(in_ptr + x); - *(out_ptr + x) = static_cast(dequantize(val, scale[id.z()])); - } - }, - in, out); -} - -template -void run_dequantization_qsymm8_per_channel_nhwc(const ITensor *input, ITensor *output, const Window &window) -{ - const auto scale = input->info()->quantization_info().scale(); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - // Reset first dimension to handle tail calculations manually - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Create iterators - Iterator in(input, win); - Iterator out(output, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t vscale = - { - { - scale[x + 0], scale[x + 1], scale[x + 2], scale[x + 3], - scale[x + 4], scale[x + 5], scale[x + 6], scale[x + 7], - scale[x + 8], scale[x + 9], 
scale[x + 10], scale[x + 11], - scale[x + 12], scale[x + 13], scale[x + 14], scale[x + 15] - } - }; - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, vscale); - - store_result(reinterpret_cast(out_ptr + x), vdeq); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int8_t val = *(in_ptr + x); - *(out_ptr + x) = static_cast(dequantize(val, scale[x])); - } - }, - in, out); -} - -template -void run_dequantization_qsymm8(const ITensor *input, ITensor *output, const Window &window) -{ - const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform(); - const float scale = qinfo.scale; - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Create iterators - Iterator in(input, win_collapsed); - Iterator out(output, win_collapsed); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, scale); - - store_result(reinterpret_cast(out_ptr + x), vdeq); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int8_t val = *(in_ptr + x); - *(out_ptr + x) = static_cast(dequantize(val, scale)); - } - }, - in, out); -} - -template -void run_dequantization_qsymm16(const ITensor *input, ITensor *output, const Window &window) -{ - const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform(); - const float scale = qinfo.scale; - - const int window_step_x = 8; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Create iterators - Iterator in(input, win_collapsed); - Iterator out(output, win_collapsed); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize_int16(vin, scale); - - store_result(reinterpret_cast(out_ptr + x), vdeq); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int16_t val = *(in_ptr + x); - *(out_ptr + x) = static_cast(dequantize_qsymm16(val, scale)); - } - }, - in, out); -} - -template -void run_dequantization_core(const ITensor *input, ITensor *output, const Window &window) -{ - switch(input->info()->data_type()) - { - case DataType::QASYMM8: - run_dequantization_qasymm8(input, output, window); - break; - case DataType::QASYMM8_SIGNED: - run_dequantization_qasymm8(input, output, window); - break; - case DataType::QSYMM8_PER_CHANNEL: - input->info()->data_layout() == DataLayout::NHWC ? 
run_dequantization_qsymm8_per_channel_nhwc(input, output, window) : run_dequantization_qsymm8_per_channel_nchw(input, output, window); - break; - case DataType::QSYMM8: - run_dequantization_qsymm8(input, output, window); - break; - case DataType::QSYMM16: - run_dequantization_qsymm16(input, output, window); - break; - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - } -} -} // namespace - -void CpuDequantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src->tensor_shape(), 1, DataType::F32); - - ICpuKernel::configure(win); -} - -Status CpuDequantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); - return Status{}; -} - -void CpuDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - switch(dst->info()->data_type()) - { - case DataType::F32: - run_dequantization_core(src, dst, window); - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - run_dequantization_core(src, dst, window); - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - } -} -const char *CpuDequantizeKernel::name() const -{ - return "CpuDequantizeKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuDequantizeKernel.h b/src/core/cpu/kernels/CpuDequantizeKernel.h deleted file mode 100644 index e80aa3aaad..0000000000 --- a/src/core/cpu/kernels/CpuDequantizeKernel.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CPU_DEQUANTIZE_KERNEL_H -#define ARM_COMPUTE_CPU_DEQUANTIZE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the dequantization layer kernel. */ -class CpuDequantizeKernel : public ICpuKernel -{ -public: - CpuDequantizeKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDequantizeKernel); - /** Set input, output tensors. - * - * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. - * @param[out] dst Destination tensor info with the same dimensions of input. Data type supported: F16/F32. - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuDequantizeKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DEQUANTIZE_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuDirectConv2dKernel.cpp b/src/core/cpu/kernels/CpuDirectConv2dKernel.cpp deleted file mode 100644 index faff55e905..0000000000 --- a/src/core/cpu/kernels/CpuDirectConv2dKernel.cpp +++ /dev/null @@ -1,1385 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
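// Hedged usage sketch (illustrative only, not part of the original sources) of the
// stateless kernel interface above: tensor infos are validated/configured first, the
// actual tensors are supplied at run time through an ITensorPack. The shapes, the
// quantization parameters and the single-threaded run_op call are assumptions for the
// example; the internal header is referenced at its pre-move location.
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/CPP/CPPTypes.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/core/cpu/kernels/CpuDequantizeKernel.h"

int main()
{
    using namespace arm_compute;

    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));

    cpu::kernels::CpuDequantizeKernel k;
    ARM_COMPUTE_ERROR_THROW_ON(cpu::kernels::CpuDequantizeKernel::validate(src.info(), dst.info()));
    k.configure(src.info(), dst.info());

    src.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC, &src);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    k.run_op(pack, k.window(), ThreadInfo{});
    return 0;
}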
- */ -#include "src/core/cpu/kernels/CpuDirectConv2dKernel.h" - -#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" -#include "src/core/NEON/wrapper/wrapper.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -using namespace arm_compute::detail; - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template -float16x8_t internal_vld1q(const float16_t *in); - -template <> -float16x8_t internal_vld1q<1>(const float16_t *in) -{ - return vld1q_f16(in); -} - -template <> -float16x8_t internal_vld1q<2>(const float16_t *in) -{ - const float16x8x2_t tmp = vld2q_f16(in); - return tmp.val[0]; -} - -template <> -float16x8_t internal_vld1q<3>(const float16_t *in) -{ - const float16x8x3_t tmp = vld3q_f16(in); - return tmp.val[0]; -} - -inline float16x8_t internal_vdupq_n(float16_t v) -{ - return vdupq_n_f16(v); -} - -inline void internal_vst1q(float16_t *p, const float16x8_t &v) -{ - vst1q_f16(p, v); -} - -float16x8_t internal_vmull(const float16x8_t &x, const float16x8_t &y) -{ - return vmulq_f16(x, y); -} - -inline float16x8_t internal_vmlal(const float16x8_t &x, const float16x8_t &y, const float16x8_t &z) -{ - return vaddq_f16(x, vmulq_f16(y, z)); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -template -float32x4_t internal_vld1q(const float *in); - -template <> -float32x4_t internal_vld1q<1>(const float *in) -{ - return vld1q_f32(in); -} - -template <> -float32x4_t internal_vld1q<2>(const float *in) -{ - const float32x4x2_t tmp = vld2q_f32(in); - return tmp.val[0]; -} - -template <> -float32x4_t internal_vld1q<3>(const float *in) -{ - const float32x4x3_t tmp = vld3q_f32(in); - return tmp.val[0]; -} - -inline float32x4_t internal_vdupq_n(float v) -{ - return vdupq_n_f32(v); -} - -inline void internal_vst1q(float *p, const float32x4_t &v) -{ - vst1q_f32(p, v); -} - -float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y) -{ - return vmulq_f32(x, y); -} - -inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z) -{ - return vmlaq_f32(x, y, z); -} - -constexpr int small_tensor_size_optim = 8; -inline bool run_optim_small_tensor_info(const ITensorInfo *t) -{ - return t->dimension(Window::DimX) <= small_tensor_size_optim && t->dimension(Window::DimY) <= small_tensor_size_optim; -} - -inline bool run_optim_small_tensor(const ITensor *t) -{ - return run_optim_small_tensor_info(t->info()); -} - -// Optimized convolver for 1x1 kernels used only where input width and height are both <= 8 -// For big Z as in Input=7x7x832, this implementation is faster than the general code becuase it doesn't need to -// store intermidiate results in memory. Temporary results are stored in SIMD registers directly and then written to the output buffer. 
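// Illustrative note (Arm NEON target assumed, not part of the original sources):
// the internal_vld1q<2> / internal_vld1q<3> specialisations above realise a strided
// load through de-interleaving loads, so the same inner loop can serve
// conv_stride_x = 1, 2 or 3. A standalone restatement of the idiom:
#include <arm_neon.h>

inline float32x4_t load_every_2nd_f32(const float *in)
{
    // vld2q_f32 splits 8 consecutive floats into even/odd lanes;
    // val[0] = { in[0], in[2], in[4], in[6] }, i.e. a stride-2 gather.
    const float32x4x2_t tmp = vld2q_f32(in);
    return tmp.val[0];
}

inline float32x4_t load_every_3rd_f32(const float *in)
{
    // vld3q_f32 de-interleaves 12 consecutive floats into three vectors;
    // val[0] = { in[0], in[3], in[6], in[9] }, i.e. a stride-3 gather.
    const float32x4x3_t tmp = vld3q_f32(in);
    return tmp.val[0];
}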
-template -class convolver_w1x1_i8x8_f32 -{ -public: - static void convolve(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) - { - ARM_COMPUTE_ERROR_ON(src->info()->dimension(Window::DimX) > small_tensor_size_optim); - ARM_COMPUTE_ERROR_ON(src->info()->dimension(Window::DimY) > small_tensor_size_optim); - - const int input_stride_x = src->info()->strides_in_bytes().x(); - const int input_stride_y = src->info()->strides_in_bytes().y(); - const int input_stride_z = src->info()->strides_in_bytes().z(); - const int output_stride_y = dst->info()->strides_in_bytes().y(); - const int output_stride_z = dst->info()->strides_in_bytes().z(); - const int kernel_stride_z = weights->info()->strides_in_bytes().z(); - const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; - const int output_h = dst->info()->dimension(1); - const int range_z = window.z().end() - window.z().start(); - const int kernel_depth = weights->info()->dimension(Window::DimZ); - const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - const unsigned int conv_pad_left = conv_info.pad_left(); - const unsigned int conv_pad_top = conv_info.pad_top(); - - // setup output window for the iterator - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX))); - window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY))); - window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z)); - - // setup input window for the iterator - Window window_in = window; - // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0 - window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Window window_k = calculate_max_window(*weights->info(), Steps(1u)); - Iterator out(dst, window_out); - Iterator in(src, window_in); - Iterator k(weights, window_k); - - const uint8_t *k_ptr = k.ptr(); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y; - uint8_t *out_ptr = out.ptr(); - int ih = 0; - int oh = 0; - std::array accum0 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; - std::array accum1 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; - for(int oz = 0; oz < range_z; ++oz) - { - accum0[0] = accum0[1] = accum0[2] = accum0[3] = accum0[4] = accum0[5] = accum0[6] = accum0[7] = vdupq_n_f32(0.f); - accum1[0] = accum1[1] = accum1[2] = accum1[3] = accum1[4] = accum1[5] = accum1[6] = accum1[7] = vdupq_n_f32(0.f); - auto p_out_base = out_ptr + oz * output_stride_z; - for(int p = 0; p < kernel_depth; ++p) - { - const auto k_val = reinterpret_cast(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w); - const auto vk0 = internal_vdupq_n(*k_val); - for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) - { - const int offset_xy = ih * input_stride_y; - auto in_val = reinterpret_cast(input_ptr + p * input_stride_z + offset_xy); - auto v_in0 = internal_vld1q(in_val); - auto v_in1 = internal_vld1q(in_val + 4); - 
accum0[oh] = vmlaq_f32(accum0[oh], vk0, v_in0); - accum1[oh] = vmlaq_f32(accum1[oh], vk0, v_in1); - } - } - for(oh = 0; oh < output_h; ++oh) - { - auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); - vst1q_f32(p_out, accum0[oh]); - vst1q_f32(p_out + 4, accum1[oh]); - } - } - }, - in, out); - } -}; - -template -class convolver_1x1 -{ -public: - static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) - { - const int input_stride_x = src->info()->strides_in_bytes().x(); - const int input_stride_y = src->info()->strides_in_bytes().y(); - const int input_stride_z = src->info()->strides_in_bytes().z(); - const int output_stride_y = dst->info()->strides_in_bytes().y(); - const int output_stride_z = dst->info()->strides_in_bytes().z(); - const int kernel_stride_z = weights->info()->strides_in_bytes().z(); - const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; - const int output_w = dst->info()->dimension(0); - const int output_h = dst->info()->dimension(1); - const int range_z = window.z().end() - window.z().start(); - const int kernel_depth = weights->info()->dimension(Window::DimZ); - const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - const unsigned int conv_pad_left = conv_info.pad_left(); - const unsigned int conv_pad_top = conv_info.pad_top(); - - // setup output window for the iterator - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX))); - window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY))); - window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z)); - - // setup input window for the iterator - Window window_in = window; - // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0 - window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Window window_k = calculate_max_window(*weights->info(), Steps(1u)); - Iterator out(dst, window_out); - Iterator in(src, window_in); - Iterator k(weights, window_k); - - const uint8_t *k_ptr = k.ptr(); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - /* - For a detailed explanation on how the algorithm works refer to template <> class convolver_3x3<1> - */ - const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y; - uint8_t *out_ptr = out.ptr(); - int ih = 0; - int oh = 0; - for(int oz = 0; oz < range_z; ++oz) - { - auto p_out_base = out_ptr + oz * output_stride_z; - // Step 1 - { - const auto k_val = reinterpret_cast(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w); - const auto vk = internal_vdupq_n(*k_val); - for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) - { - const int offset_xy = ih * input_stride_y; - auto in_val = reinterpret_cast(input_ptr + (0 * input_stride_z + offset_xy)); - auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); - for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration) - { - internal_vst1q(p_out, internal_vmull(vk, 
internal_vld1q(in_val))); - } - } - } - - // Step 2 - for(int p = 1; p < kernel_depth; ++p) - { - const auto k_val = reinterpret_cast(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w); - const auto vk = internal_vdupq_n(*k_val); - for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) - { - const int offset_xy = ih * input_stride_y; - auto in_val = reinterpret_cast(input_ptr + p * input_stride_z + offset_xy); - auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); - for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration) - { - internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q(in_val))); - } - } - } - } - }, - in, out); - } -}; - -template -float32x4x2_t convolve_5x5(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, - const float *m0, const float *m1, const float *m2, const float *m3, const float *m4); - -inline float32x4x3_t load_matrix_hi(const float *const m0, const float *const m1, const float *const m2) -{ - const float32x4x3_t m00 = - { - { - vld1q_dup_f32(m0), - vld1q_dup_f32(m1), - vld1q_dup_f32(m2) - } - }; - return m00; -} - -inline float32x4x2_t load_matrix_lo(const float *const m3, const float *const m4) -{ - const float32x4x2_t m00 = - { - { - vld1q_dup_f32(m3), - vld1q_dup_f32(m4) - } - }; - return m00; -} - -inline float32x4x3_t load_input(const float *const in) -{ - const float32x4x3_t vin = - { - { - vld1q_f32(in), - vld1q_f32(in + 4), - vld1q_f32(in + 8) - } - }; - return vin; -} - -template <> -inline float32x4x2_t convolve_5x5<1>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, - const float *m0, const float *m1, const float *m2, const float *m3, const float *m4) -{ - const float32x4x3_t vin0 = load_input(in_0); - const float32x4x3_t vin1 = load_input(in_1); - const float32x4x3_t vin2 = load_input(in_2); - const float32x4x3_t vin3 = load_input(in_3); - const float32x4x3_t vin4 = load_input(in_4); - const float32x4x3_t m00 = load_matrix_hi(m0, 1 + m0, 2 + m0); - const float32x4x2_t m01 = load_matrix_lo(3 + m0, 4 + m0); - const float32x4x3_t m10 = load_matrix_hi(m1, 1 + m1, 2 + m1); - const float32x4x2_t m11 = load_matrix_lo(3 + m1, 4 + m1); - const float32x4x3_t m20 = load_matrix_hi(m2, 1 + m2, 2 + m2); - const float32x4x2_t m21 = load_matrix_lo(3 + m2, 4 + m2); - const float32x4x3_t m30 = load_matrix_hi(m3, 1 + m3, 2 + m3); - const float32x4x2_t m31 = load_matrix_lo(3 + m3, 4 + m3); - const float32x4x3_t m40 = load_matrix_hi(m4, 1 + m4, 2 + m4); - const float32x4x2_t m41 = load_matrix_lo(3 + m4, 4 + m4); - - float32x4x2_t out = - { - { - vmulq_f32(vin0.val[0], m00.val[0]), - vmulq_f32(vin0.val[1], m00.val[0]) - } - }; - - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 1), m00.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 2), m00.val[2]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 3), m01.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vin0.val[1], m01.val[1]); - - out.val[0] = vmlaq_f32(out.val[0], vin1.val[0], m10.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 1), m10.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 2), m10.val[2]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 3), m11.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vin1.val[1], 
m11.val[1]); - - out.val[0] = vmlaq_f32(out.val[0], vin2.val[0], m20.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 1), m20.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 2), m20.val[2]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 3), m21.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vin2.val[1], m21.val[1]); - - out.val[0] = vmlaq_f32(out.val[0], vin3.val[0], m30.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 1), m30.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 2), m30.val[2]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 3), m31.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vin3.val[1], m31.val[1]); - - out.val[0] = vmlaq_f32(out.val[0], vin4.val[0], m40.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 1), m40.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 2), m40.val[2]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 3), m41.val[0]); - out.val[0] = vmlaq_f32(out.val[0], vin4.val[1], m41.val[1]); - - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 1), m00.val[1]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 2), m00.val[2]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 3), m01.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vin0.val[2], m01.val[1]); - - out.val[1] = vmlaq_f32(out.val[1], vin1.val[1], m10.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 1), m10.val[1]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 2), m10.val[2]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 3), m11.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vin1.val[2], m11.val[1]); - - out.val[1] = vmlaq_f32(out.val[1], vin2.val[1], m20.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 1), m20.val[1]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 2), m20.val[2]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 3), m21.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vin2.val[2], m21.val[1]); - - out.val[1] = vmlaq_f32(out.val[1], vin3.val[1], m30.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 1), m30.val[1]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 2), m30.val[2]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 3), m31.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vin3.val[2], m31.val[1]); - - out.val[1] = vmlaq_f32(out.val[1], vin4.val[1], m40.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 1), m40.val[1]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 2), m40.val[2]); - out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 3), m41.val[0]); - out.val[1] = vmlaq_f32(out.val[1], vin4.val[2], m41.val[1]); - - return out; -} - -template <> -inline float32x4x2_t convolve_5x5<2>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, - const float *m0, const float *m1, const float *m2, const float *m3, const float *m4) -{ - float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4); - out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 
1); - out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); - out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); - return out; -} - -template <> -inline float32x4x2_t convolve_5x5<3>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, - const float *m0, const float *m1, const float *m2, const float *m3, const float *m4) -{ - float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4); - out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); - return out; -} - -template -class convolver_3x3 -{ -public: - static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) - { - ARM_COMPUTE_UNUSED(num_elems_read_per_iteration); - const int input_stride_x = src->info()->strides_in_bytes().x(); - const int input_stride_y = src->info()->strides_in_bytes().y(); - const int input_stride_z = src->info()->strides_in_bytes().z(); - const int output_stride_y = dst->info()->strides_in_bytes().y(); - const int output_stride_z = dst->info()->strides_in_bytes().z(); - const int kernel_stride_x = weights->info()->strides_in_bytes().x(); - const int kernel_stride_y = weights->info()->strides_in_bytes().y(); - const int kernel_stride_z = weights->info()->strides_in_bytes().z(); - const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; - const int output_w = dst->info()->dimension(0); - const int output_h = dst->info()->dimension(1); - const int num_planes_z = window.z().end() - window.z().start(); - const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex); - const int kernel_depth = weights->info()->dimension(Window::DimZ); - const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - const unsigned int conv_pad_left = conv_info.pad_left(); - const unsigned int conv_pad_top = conv_info.pad_top(); - - // setup output window for the iterator - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX))); - window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY))); - window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z)); - - // setup input window for the iterator - Window window_in = window; - // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0 - window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Window window_k = calculate_max_window(*weights->info(), Steps(1u)); - - Iterator out(dst, window_out); - Iterator in(src, window_in); - Iterator k(weights, window_k); - - const uint8_t *k_ptr = k.ptr(); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y; - uint8_t *out_ptr = out.ptr(); - int ih = 0; - int oh = 0; - /* - Each thread executing this kernel computes one or more output's volume planes. 
- - Let's say the 3rd dimension of the output volume is 32, the first thread will compute the output for Z = [0,7], the second thread will compute the output for Z = [8,15], - the third thread [16,24] and the fourth thread [25,31]. - - The algorithm outer loop iterates over Z, P, Y, X where P is the depth/3rd dimension of each kernel. This order is not arbitrary, the main benefit of this - is that we setup the neon registers containing the kernel's values only once and then compute each XY using the preloaded registers as opposed as doing this for every XY value. - - The algorithm does not require allocating any additional memory amd computes the results directly in-place in two stages: - 1) Convolve plane 0 with kernel 0 and initialize the corresponding output plane with these values. - 2) Convolve the remaining planes and accumulate the results in the output's plane which has been initialized in step 1. - */ - for(int oz = 0; oz < num_planes_z; ++oz) - { - const int zoffset = id.z() + oz; - uint8_t *p_out_base = out_ptr + oz * output_stride_z; - // Step 1 - { - const auto ptr_k_r0 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r1 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r2 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x); - const auto vk_r0 = load_matrix_row(ptr_k_r0); - const auto vk_r1 = load_matrix_row(ptr_k_r1); - const auto vk_r2 = load_matrix_row(ptr_k_r2); - for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) - { - auto in_top = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y); - auto in_mid = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y); - auto in_low = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y); - auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); - for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, - in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration) - { - convolve_3x3(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex); - } - } - } - // Step 2 - for(int p = 1; p < kernel_depth; ++p) - { - const uint8_t *ptr_k_base = k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w; - const uint8_t *input_base = input_ptr + p * input_stride_z; - const auto ptr_k_r0 = reinterpret_cast(ptr_k_base); - const auto ptr_k_r1 = reinterpret_cast(ptr_k_base + kernel_stride_y); - const auto ptr_k_r2 = reinterpret_cast(ptr_k_base + kernel_stride_y * 2); - const auto vk_r0 = load_matrix_row(ptr_k_r0); - const auto vk_r1 = load_matrix_row(ptr_k_r1); - const auto vk_r2 = load_matrix_row(ptr_k_r2); - for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) - { - auto in_top = reinterpret_cast(input_base + (ih + 0) * input_stride_y); - auto in_mid = reinterpret_cast(input_base + (ih + 1) * input_stride_y); - auto in_low = reinterpret_cast(input_base + (ih + 2) * input_stride_y); - auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); - for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, - in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration) - { - convolve_3x3(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex); - } - } - } - } - }, - in, out); 
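// Compact scalar restatement (illustrative only, shown for a 1x1 kernel for brevity)
// of the two-stage scheme described in the comment above: kernel plane 0 initialises
// each output plane, the remaining kernel planes accumulate into it, so no
// intermediate buffer is needed. The 3x3 and 5x5 convolvers follow the same
// structure but replace the per-element product with a small 2-D stencil.
void convolve_1x1_two_stage(const float *in, const float *k, float *out,
                            int kernel_depth, int plane_elems)
{
    // Stage 1: initialise the output plane with the contribution of input plane 0.
    for(int i = 0; i < plane_elems; ++i)
    {
        out[i] = k[0] * in[i];
    }
    // Stage 2: accumulate the contributions of the remaining planes in-place.
    for(int p = 1; p < kernel_depth; ++p)
    {
        for(int i = 0; i < plane_elems; ++i)
        {
            out[i] += k[p] * in[p * plane_elems + i];
        }
    }
}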
- } -}; - -template -class convolver_5x5 -{ -public: - static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) - { - ARM_COMPUTE_UNUSED(num_elems_read_per_iteration); - const int input_stride_x = src->info()->strides_in_bytes().x(); - const int input_stride_y = src->info()->strides_in_bytes().y(); - const int input_stride_z = src->info()->strides_in_bytes().z(); - const int output_stride_y = dst->info()->strides_in_bytes().y(); - const int output_stride_z = dst->info()->strides_in_bytes().z(); - const int kernel_stride_x = weights->info()->strides_in_bytes().x(); - const int kernel_stride_y = weights->info()->strides_in_bytes().y(); - const int kernel_stride_z = weights->info()->strides_in_bytes().z(); - const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; - const int output_w = dst->info()->dimension(0); - const int output_h = dst->info()->dimension(1); - const int num_planes_z = window.z().end() - window.z().start(); - const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex); - const int kernel_depth = weights->info()->dimension(Window::DimZ); - const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - const unsigned int conv_pad_left = conv_info.pad_left(); - const unsigned int conv_pad_top = conv_info.pad_top(); - - // setup output window for the iterator - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX))); - window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY))); - window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z)); - - // setup input window for the iterator - Window window_in = window; - // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0 - window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Window window_k = calculate_max_window(*weights->info(), Steps(1u)); - - Iterator out(dst, window_out); - Iterator in(src, window_in); - Iterator k(weights, window_k); - - const uint8_t *k_ptr = k.ptr(); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y; - uint8_t *out_ptr = out.ptr(); - int ih = 0; - int oh = 0; - for(int oz = 0; oz < num_planes_z; ++oz) - { - const int zoffset = id.z() + oz; - uint8_t *p_out_base = out_ptr + oz * output_stride_z; - // Step 1 - { - const auto ptr_k_r0 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r1 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r2 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r3 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r4 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 4 * 
kernel_stride_y + 0 * kernel_stride_x); - for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) - { - auto in_0 = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y); - auto in_1 = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y); - auto in_2 = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y); - auto in_3 = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 3) * input_stride_y); - auto in_4 = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 4) * input_stride_y); - auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); - for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, - in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration) - { - auto vres = convolve_5x5(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4); - store_results(p_out, vres); - } - } - } - // Step 2 - for(int p = 1; p < kernel_depth; ++p) - { - const auto ptr_k_r0 = reinterpret_cast(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r1 = reinterpret_cast(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r2 = reinterpret_cast(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r3 = reinterpret_cast(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x); - const auto ptr_k_r4 = reinterpret_cast(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x); - - for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) - { - auto in_0 = reinterpret_cast(input_ptr + p * input_stride_z + (ih + 0) * input_stride_y); - auto in_1 = reinterpret_cast(input_ptr + p * input_stride_z + (ih + 1) * input_stride_y); - auto in_2 = reinterpret_cast(input_ptr + p * input_stride_z + (ih + 2) * input_stride_y); - auto in_3 = reinterpret_cast(input_ptr + p * input_stride_z + (ih + 3) * input_stride_y); - auto in_4 = reinterpret_cast(input_ptr + p * input_stride_z + (ih + 4) * input_stride_y); - auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); - for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, - in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration) - { - auto vres = convolve_5x5(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4); - accumulate_results(p_out, vres); - } - } - } - } - }, - in, out); - } -}; - -float vreduce(const float32x4_t &v) -{ - auto v0 = wrapper::vgethigh(v); - auto v1 = wrapper::vgetlow(v); - auto v_out = wrapper::vadd(v0, v1); - - float a = wrapper::vgetlane(v_out, 0); - float b = wrapper::vgetlane(v_out, 1); - return a + b; -} - -template -inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) -{ - const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - switch(conv_stride_x) - { - case 1: - convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - case 2: - 
convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - case 3: - convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - default: - ARM_COMPUTE_ERROR("Not implemented"); - } -} - -template <> -inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) -{ - const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - if(run_optim_small_tensor(src)) - { - switch(conv_stride_x) - { - case 1: - convolver_w1x1_i8x8_f32<1>::convolve(window, src, weights, dst, conv_info); - break; - case 2: - convolver_w1x1_i8x8_f32<2>::convolve(window, src, weights, dst, conv_info); - break; - case 3: - convolver_w1x1_i8x8_f32<3>::convolve(window, src, weights, dst, conv_info); - break; - default: - ARM_COMPUTE_ERROR("Not implemented"); - } - } - else - { - switch(conv_stride_x) - { - case 1: - convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - case 2: - convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - case 3: - convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - default: - ARM_COMPUTE_ERROR("Not implemented"); - } - } -} - -template -inline void convolve_3x3(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) -{ - const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - switch(conv_stride_x) - { - case 1: - convolver_3x3::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - case 2: - convolver_3x3::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - case 3: - convolver_3x3::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - default: - ARM_COMPUTE_ERROR("Not implemented"); - } -} - -template -inline void convolve_5x5(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, - const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) -{ - const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - switch(conv_stride_x) - { - case 1: - convolver_5x5::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - case 2: - convolver_5x5::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - case 3: - convolver_5x5::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); - break; - default: - ARM_COMPUTE_ERROR("Not implemented"); - } -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == 
DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); - - const DataLayout data_layout = src->data_layout(); - const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported."); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(channel_idx) != src->dimension(channel_idx)); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx)); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); - ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && src->data_type() != DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(width_idx) > 3) && (src->data_type() == DataType::F16)); - - // Checks performed when output is configured - if(dst->total_size() != 0) - { - TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); - - DataType data_type = src->data_type(); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape); - ARM_COMPUTE_RETURN_ERROR_ON(dst->data_type() != data_type); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info, unsigned int &num_weight_elems_read_per_row, - unsigned int &num_elems_read_per_iteration, unsigned int &num_elems_written_per_iteration, BorderSize &border_size) -{ - ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); - - const DataLayout data_layout = src->data_layout(); - const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - - // Calculate right and bottom border - unsigned int kernel_size = weights->dimension(width_idx); - const int conv_stride_x = std::get<0>(conv_info.stride()); - const int conv_stride_y = std::get<1>(conv_info.stride()); - const int input_width = src->dimension(width_idx); - - Window win{}; - bool window_changed = false; - - if(data_layout == DataLayout::NCHW) - { - switch(kernel_size) - { - case 1: - { - switch(src->data_type()) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - num_elems_written_per_iteration = 8; - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - if(run_optim_small_tensor_info(src)) - { - num_elems_written_per_iteration = 8; - } - else - { - num_elems_written_per_iteration = 4; - } - break; - default: - ARM_COMPUTE_ERROR("Data type not supported."); - break; - } - num_weight_elems_read_per_row = kernel_size; - num_elems_read_per_iteration = conv_stride_x * num_elems_written_per_iteration; - break; - } - case 3: - switch(src->data_type()) - { - case DataType::F32: - num_weight_elems_read_per_row = 4 + kernel_size - 1; - num_elems_read_per_iteration = 12; - num_elems_written_per_iteration = 16 >> conv_stride_x; - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - num_weight_elems_read_per_row = 8 + kernel_size - 1; - num_elems_read_per_iteration = 24; - num_elems_written_per_iteration = 32 >> conv_stride_x; - break; -#endif 
/* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - ARM_COMPUTE_ERROR("Data type not supported."); - break; - } - break; - case 5: - { - switch(src->data_type()) - { - case DataType::F32: - num_weight_elems_read_per_row = 4 + kernel_size - 1; - num_elems_read_per_iteration = 12; - num_elems_written_per_iteration = 16 >> conv_stride_x; - break; - default: - ARM_COMPUTE_ERROR("Data type not supported."); - break; - } - } - break; - default: - { - ARM_COMPUTE_ERROR("Not implemented"); - break; - } - } - - // Calculate right pad - int start_x = kernel_size / 2 - static_cast(conv_info.pad_left()); - int end_x = ceil_to_multiple(static_cast(dst->dimension(0)), num_elems_written_per_iteration) * conv_stride_x; - int upper_bound_w = ceil_to_multiple(start_x + end_x, num_elems_read_per_iteration) - input_width; - - // Calculate border - const unsigned int conv_pad_left = conv_info.pad_left(); - const unsigned int conv_pad_top = conv_info.pad_top(); - const unsigned int conv_pad_right = std::max(upper_bound_w, 0); - const unsigned int conv_pad_bottom = conv_info.pad_bottom(); - - border_size.left = conv_pad_left; - border_size.top = conv_pad_top; - border_size.right = conv_pad_right; - border_size.bottom = conv_pad_bottom; - - // Configure window - win = calculate_max_window(*dst, Steps(num_elems_written_per_iteration)); - - AccessWindowRectangle input_access(src, -conv_pad_left, -conv_pad_top, - num_elems_read_per_iteration, kernel_size, - conv_stride_x, conv_stride_y); - AccessWindowStatic weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_size); - AccessWindowHorizontal output_access(dst, 0, num_elems_written_per_iteration); - window_changed = update_window_and_padding(win, input_access, weights_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); - } - else - { - // Configure window NHWC without any padding - win = calculate_max_window(*dst, Steps()); - } - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} - -bool have_zero_x_internal_padding(ITensorInfo *src, const ITensorInfo *weights) -{ - return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && weights->padding().right == 0); -} - -} // namespace - -template -void CpuDirectConv2dKernel::convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst) -{ - // This function assumes that input and weights have not padding in channel - - // Declare useful types - using vtype = wrapper::traits::neon_bitvector; - using vector_type = typename vtype::type; - using tag_type = typename vtype::tag_type; - - // Scalar quantities - const int element_size = src->info()->element_size(); - const int input_stride_w = src->info()->strides_in_bytes().y() / element_size; - const int input_stride_h = src->info()->strides_in_bytes().z() / element_size; - const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size; - const int input_dim_w = src->info()->dimension(1); - const int input_dim_h = src->info()->dimension(2); - - const int output_stride_c = dst->info()->strides_in_bytes().x(); - - const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size; - const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size; - const int kernel_dim_w = weights->info()->dimension(1); - const int kernel_dim_h = weights->info()->dimension(2); - - const int conv_pad_top = _conv_info.pad_top(); - const int conv_pad_left = _conv_info.pad_left(); - const int conv_stride_w = std::get<0>(_conv_info.stride()); - const int conv_stride_h = std::get<1>(_conv_info.stride()); - - // Setup input window for the output iterator - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Setup input window for the weights iterator - Window window_w = calculate_max_window(*weights->info(), Steps()); - window_w.set(Window::DimX, Window::Dimension(0, 1, 1)); - window_w.set(Window::DimY, Window::Dimension(0, 1, 1)); - window_w.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - Iterator out(dst, window_out); - Iterator wei(weights, window_w); - - constexpr int num_elems_read_per_iteration = 16 / sizeof(T); - /* - * This implementation parallelize the full WC plane of input and weights by - * treating them as series of elements. So for example, a 3x3 weights and - * floating point vector operations of 4 elements per time, the first 3 - * channel elements of the first row would be taken and additionally the first - * element of the second row. The 9 elements in each single WC weight plane - * would require 2 4-element vector operations and a last single element operation. - * - * This works since when we create the input vector to multiply with the weights, - * the exact required elements are loaded in the same order. Therefore the - * multiplication works on the correct input/weight elements. - */ - execute_window_loop(window_out, [&](const Coordinates & id) - { - /* - * In here we create theoretical indexes which then we validate for both - * inputs and weights. - * As a reminder, this loop take each output point in NHW, C is treated - * in the weights loop. 
- */ - // We are computing the theoretical starting input starting points - const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; - const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; - const int in_w_end_t = in_w_start_t + kernel_dim_w; - const int in_h_end_t = in_h_start_t + kernel_dim_h; - - // We are computing the valid initial and ending input points by checking the borders - const int in_w_start = std::max(in_w_start_t, 0); - const int in_h_start = std::max(in_h_start_t, 0); - const int in_w_end = std::min(in_w_end_t, input_dim_w); - const int in_h_end = std::min(in_h_end_t, input_dim_h); - - // We use the input points to select the valid weight points to use - const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w; - const int index_h_start = in_h_start - in_h_start_t; - const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w; - const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end); - - execute_window_loop(window_w, [&](const Coordinates & id_w) - { - /* - * This is the loop in the weights, and it goes along N (the batches) - * As a reminder, the batches of the weights are translated into the - * channels of the output - */ - const T *in_ptr_row = reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) - + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h; - const T *weights_ptr_row = reinterpret_cast(wei.ptr()) + index_h_start * kernel_stride_h; - uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; - - T out_temp = static_cast(0); - for(int index_h = index_h_start; index_h < index_h_end; ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h) - { - const T *in_ptr_mover = in_ptr_row; - int index_wc = index_wc_start; - vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); - for(; index_wc <= index_wc_end - num_elems_read_per_iteration; index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration) - { - const auto src_vec = wrapper::vloadq(in_ptr_mover); - const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc); - out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); - } - out_temp += vreduce(out_temp_vec); - for(; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover) - { - const auto src_val = *(in_ptr_mover); - const auto w_val = *(weights_ptr_row + index_wc); - out_temp += src_val * w_val; - } - } - *(reinterpret_cast(out_ptr)) = out_temp; - }, - wei); - }, - out); -} - -template -void CpuDirectConv2dKernel::convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst) -{ - // Declare useful types - using vtype = wrapper::traits::neon_bitvector; - using vector_type = typename vtype::type; - using tag_type = typename vtype::tag_type; - - // Scalar quantities - const int element_size = src->info()->element_size(); - const int input_stride_w = src->info()->strides_in_bytes().y() / element_size; - const int input_stride_h = src->info()->strides_in_bytes().z() / element_size; - const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size; - const int input_dim_w = src->info()->dimension(1); - const int input_dim_h = src->info()->dimension(2); - - const int output_stride_c = dst->info()->strides_in_bytes().x(); - - const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size; - const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size; - 
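// Illustrative helper (hypothetical, not part of the original sources) summarising the
// border-clamping index arithmetic used by convolve_nhwc_optimized above and repeated
// in convolve_nhwc below: the theoretical input window implied by an output coordinate
// is clamped to the input extent, and the clamped amount selects which weight
// rows/columns still participate, so no out-of-bounds element is ever read.
#include <algorithm>

struct ConvRange
{
    int in_start;  // first valid input coordinate
    int in_end;    // one past the last valid input coordinate
    int wei_start; // first weight element that still overlaps the input
    int wei_end;   // one past the last weight element that still overlaps the input
};

inline ConvRange make_conv_range(int out_coord, int stride, int pad, int kernel_dim, int input_dim)
{
    const int in_start_t = out_coord * stride - pad; // theoretical start, may be negative
    const int in_end_t   = in_start_t + kernel_dim;  // theoretical end, may exceed input_dim
    const int in_start   = std::max(in_start_t, 0);
    const int in_end     = std::min(in_end_t, input_dim);
    return { in_start, in_end, in_start - in_start_t, kernel_dim - (in_end_t - in_end) };
}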
const int kernel_dim_w = weights->info()->dimension(1); - const int kernel_dim_h = weights->info()->dimension(2); - - const int conv_pad_top = _conv_info.pad_top(); - const int conv_pad_left = _conv_info.pad_left(); - const int conv_stride_w = std::get<0>(_conv_info.stride()); - const int conv_stride_h = std::get<1>(_conv_info.stride()); - - // Setup input window for the output iterator - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Setup input window for the weights iterator - Window window_w = calculate_max_window(*weights->info(), Steps()); - window_w.set(Window::DimX, Window::Dimension(0, 1, 1)); - window_w.set(Window::DimY, Window::Dimension(0, 1, 1)); - window_w.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - Iterator out(dst, window_out); - Iterator wei(weights, window_w); - - constexpr int num_elems_read_per_iteration = 16 / sizeof(T); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - // We are computing the theoretical starting input starting points - const int in_w_start_t = static_cast(id.y()) * conv_stride_w - conv_pad_left; - const int in_h_start_t = static_cast(id.z()) * conv_stride_h - conv_pad_top; - const int in_w_end_t = in_w_start_t + kernel_dim_w; - const int in_h_end_t = in_h_start_t + kernel_dim_h; - - // We are computing the valid initial and ending input points by checking the borders - const int in_w_start = std::max(in_w_start_t, 0); - const int in_h_start = std::max(in_h_start_t, 0); - const int in_w_end = std::min(in_w_end_t, input_dim_w); - const int in_h_end = std::min(in_h_end_t, input_dim_h); - - // We use the input points to select the valid weight points to use - const int wei_w_start = in_w_start - in_w_start_t; - const int wei_h_start = in_h_start - in_h_start_t; - const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); - const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); - - const int index_c_end = weights->info()->dimension(0); - const T *const in_ptr_start = reinterpret_cast(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n; - - execute_window_loop(window_w, [&](const Coordinates & id_w) - { - const T *const weights_ptr_start = reinterpret_cast(wei.ptr()); - uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; - - T out_temp = static_cast(0); - for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h) - { - const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h; - const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h; - for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w) - { - const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; - const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; - int index_c = 0; - vector_type out_temp_vec = wrapper::vdup_n(static_cast(0), tag_type()); - for(; index_c <= index_c_end - num_elems_read_per_iteration; index_c += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration, weights_ptr_mover += num_elems_read_per_iteration) - { - const auto src_vec = wrapper::vloadq(in_ptr_mover); - const auto w_vec = wrapper::vloadq(weights_ptr_mover); - out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); - } - out_temp += vreduce(out_temp_vec); - for(; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover) - { - const auto src_val = *(in_ptr_mover); - const auto w_val 
= *(weights_ptr_mover); - out_temp += src_val * w_val; - } - } - } - *(reinterpret_cast(out_ptr)) = out_temp; - }, - wei); - }, - out); -} - -BorderSize CpuDirectConv2dKernel::border_size() const -{ - return _border_size; -} - -void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - - _conv_info = conv_info; - _data_layout = src->data_layout(); - _kernel_size = weights->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH)); - - const unsigned int conv_pad_left = conv_info.pad_left(); - const unsigned int conv_pad_top = conv_info.pad_top(); - const unsigned int conv_pad_right = conv_info.pad_right(); - const unsigned int conv_pad_bottom = conv_info.pad_bottom(); - if(_data_layout == DataLayout::NCHW) - { - _border_size = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left); - } - else - { - _border_size = BorderSize(0); - } - - // Get convolved dimensions - TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); - - DataType data_type = src->data_type(); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, output_shape, 1, data_type); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, dst, conv_info)); - - // Configure kernel window - auto win_config = validate_and_configure_window(src, weights, dst, conv_info, _num_weight_elems_read_per_row, - _num_elems_read_per_iteration, _num_elems_written_per_iteration, _border_size); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICpuKernel::configure(win_config.second); -} - -Status CpuDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info) -{ - unsigned int num_weight_elems_read_per_row = 0; - unsigned int num_elems_read_per_iteration = 0; - unsigned int num_elems_written_per_iteration = 0; - BorderSize border_size = {}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), - weights->clone().get(), - dst->clone().get(), - conv_info, - num_weight_elems_read_per_row, - num_elems_read_per_iteration, - num_elems_written_per_iteration, - border_size) - .first); - - return Status{}; -} - -void CpuDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - const int kernel_size = weights->info()->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH)); - - if(_data_layout == DataLayout::NCHW) - { - switch(kernel_size) - { - case 1: - { - switch(src->info()->data_type()) - { - case DataType::F32: - convolve_1x1(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - convolve_1x1(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - 
default: - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - break; - } - case 3: - { - switch(src->info()->data_type()) - { - case DataType::F32: - convolve_3x3(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - convolve_3x3(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - break; - } - case 5: - { - switch(src->info()->data_type()) - { - case DataType::F32: - convolve_5x5(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); - break; - default: - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - break; - } - default: - { - ARM_COMPUTE_ERROR("Only kernel sizes 1x1, 3x3 and 5x5 are supported."); - break; - } - } - } - else - { - switch(src->info()->data_type()) - { - case DataType::F32: - { - if(have_zero_x_internal_padding(src->info(), weights->info())) - { - convolve_nhwc_optimized(window, src, weights, dst); - } - else - { - convolve_nhwc(window, src, weights, dst); - } - break; - } - default: - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - } -} -const char *CpuDirectConv2dKernel::name() const -{ - return "CpuDirectConvolutionLayerKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuDirectConv2dKernel.h b/src/core/cpu/kernels/CpuDirectConv2dKernel.h deleted file mode 100644 index 9bef1c484a..0000000000 --- a/src/core/cpu/kernels/CpuDirectConv2dKernel.h +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DIRECT_CONV2D_KERNEL_H -#define ARM_COMPUTE_CPU_DIRECT_CONV2D_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the kernel to perform Direct Convolution Layer. */ -class CpuDirectConv2dKernel : public ICpuKernel -{ -public: - CpuDirectConv2dKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv2dKernel); - /** Set the src, weights, and dst tensors. 
- * - * @note: DirectConvolution only works in the following configurations: - * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 - * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 - * - * @param[in] src The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * The 3rd dimension must be the same as the input's volume 3rd dimension. - * Data type supported:Same as @p input. - * @param[out] dst Output tensor. - * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: F16/F32 - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - */ - void configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuDirectConv2dKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - BorderSize border_size() const override; - -private: - /* Template function for optimized convolution NHWC */ - template - void convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst); - - /* Template function for convolution NHWC */ - template - void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst); - - PadStrideInfo _conv_info{}; - BorderSize _border_size{}; - unsigned int _kernel_size{ 0 }; - unsigned int _num_weight_elems_read_per_row{ 0 }; - unsigned int _num_elems_read_per_iteration{ 0 }; - unsigned int _num_elems_written_per_iteration{ 0 }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /*ARM_COMPUTE_CPU_DIRECTCONV2D_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp b/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp deleted file mode 100644 index 662d052941..0000000000 --- a/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp +++ /dev/null @@ -1,513 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, - const DirectConvolutionLayerOutputStageKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::S32, DataType::F32); - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL))); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - } - - if(src->data_type() == DataType::S32) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst == nullptr, "In-place computation not allowed for quantized output"); - } - - // Checks performed when output is configured - if((dst != nullptr) && (dst->total_size() != 0)) - { - if(is_data_type_float(src->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - else if(src->data_type() == DataType::S32) - { - // In case of quantized computation and unconfigured output, the output data type must be provided through DirectConvolutionLayerOutputStageKernelInfo - ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) && (info.output_data_type != DataType::QASYMM8_SIGNED)); - } - - return Status{}; -} - -template -typename std::enable_if::value, void>::type -output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) -{ - const bool has_bias = bias != nullptr; - /** SIMD vector tag type. 
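Reviewer note on the output-stage implementations that follow: bias indexing differs by layout. In the NCHW variants the inner x-loop walks the width, so a single scalar bias per output channel (taken from the z coordinate) is broadcast across the whole row; in the NHWC variants the x-loop walks the channels, so a contiguous slice of the bias vector is loaded and added lane by lane. An illustrative plain-C++ sketch with hypothetical names, not the kernel's actual code:

#include <cstddef>

// NCHW: one scalar bias for the current channel, broadcast over the row.
void add_bias_nchw_row(float *row, size_t width, float bias_for_this_channel)
{
    for(size_t x = 0; x < width; ++x) { row[x] += bias_for_this_channel; }
}

// NHWC: per-channel biases added element-wise across the channel dimension.
void add_bias_nhwc_pixel(float *pixel, const float *bias, size_t channels)
{
    for(size_t c = 0; c < channels; ++c) { pixel[c] += bias[c]; }
}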
*/ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - ARM_COMPUTE_ERROR_ON(src->info()->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); - ARM_COMPUTE_UNUSED(result_shift); - ARM_COMPUTE_UNUSED(result_offset_after_shift); - - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 16 / src->info()->element_size(); - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates & id) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()) + x; - auto v_in = wrapper::vloadq(in_ptr); - - // Accumulate bias - if(has_bias) - { - const auto vb = wrapper::vdup_n(*reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{}); - v_in = wrapper::vadd(v_in, vb); - } - - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - wrapper::vstore(out_ptr, v_in); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - auto s_in = *(reinterpret_cast(in.ptr()) + x); - - // Accumulate bias - if(has_bias) - { - const auto b = *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))); - s_in += b; - } - - *(reinterpret_cast(out.ptr()) + x) = s_in; - } - - }, - in, out); -} - -template -typename std::enable_if::value, void>::type -output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) -{ - const bool has_bias = bias != nullptr; - ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); - ARM_COMPUTE_UNUSED(result_shift); - ARM_COMPUTE_UNUSED(result_offset_after_shift); - - Window window_bias = window; - window_bias.set(Window::DimX, Window::Dimension(0, 1, 1)); - window_bias.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0)); - window_bias.set(3, Window::Dimension(0, 0, 0)); - - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 16 / src->info()->element_size(); - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator bi(bias, window_bias); - Iterator out(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()); - auto v_in = wrapper::vloadq(in_ptr + x); - - // Accumulate bias - if(has_bias) - { - const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; - v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr)); - } - - const auto out_ptr = reinterpret_cast(out.ptr()); - wrapper::vstore(out_ptr + x, v_in); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - auto s_in = *(reinterpret_cast(in.ptr()) + x); - - // Accumulate bias - if(has_bias) - { - const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; - s_in += *bias_ptr; - } - - const auto out_ptr = reinterpret_cast(out.ptr()); - *(out_ptr + x) = s_in; - } - }, - in, bi, out); -} - -// Quantized case -template < typename TOut, typename std::enable_if < std::is_same::value || std::is_same::value, int >::type = 0 > -void 
output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) -{ - const bool has_bias = bias != nullptr; - using VectorType = typename wrapper::traits::neon_bitvector_t; - using TagType = typename wrapper::traits::neon_bitvector_tag_t; - - const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift); - - const VectorType min = wrapper::vdup_n(std::numeric_limits::lowest(), TagType{}); - const VectorType max = wrapper::vdup_n(std::numeric_limits::max(), TagType{}); - - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 16 / src->info()->element_size(); - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator out(dst, win); - - execute_window_loop(win, [&](const Coordinates & id) - { - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()) + x; - int32x4x4_t v_in = - { - { - wrapper::vloadq(in_ptr), - wrapper::vloadq(in_ptr + 4), - wrapper::vloadq(in_ptr + 8), - wrapper::vloadq(in_ptr + 12) - } - }; - - // Accumulate bias - if(has_bias) - { - const auto vb = wrapper::vdup_n(*reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))), TagType{}); - v_in = - { - { - wrapper::vadd(v_in.val[0], vb), - wrapper::vadd(v_in.val[1], vb), - wrapper::vadd(v_in.val[2], vb), - wrapper::vadd(v_in.val[3], vb) - } - }; - } - - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, - min, max, false)); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - int32_t s_in = *(reinterpret_cast(in.ptr()) + x); - - // Accumulate bias - if(has_bias) - { - const auto b = *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))); - s_in += b; - } - - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, - std::numeric_limits::lowest(), std::numeric_limits::max(), false); - } - }, - in, out); -} -template < typename TOut, typename std::enable_if < std::is_same::value || std::is_same::value, int >::type = 0 > -void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) -{ - const bool has_bias = bias != nullptr; - using VectorType = typename wrapper::traits::neon_bitvector_t; - using TagType = typename wrapper::traits::neon_bitvector_tag_t; - - const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift); - - const VectorType min = wrapper::vdup_n(std::numeric_limits::lowest(), TagType{}); - const VectorType max = wrapper::vdup_n(std::numeric_limits::max(), TagType{}); - - Window window_bias = window; - window_bias.set(Window::DimX, Window::Dimension(0, 1, 1)); - window_bias.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0)); - window_bias.set(3, Window::Dimension(0, 0, 0)); - - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 16 / src->info()->element_size(); - Window win = window; - 
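Reviewer note: the quantized paths above and below funnel the accumulated int32 values through finalize_quantization (from NEAsymm.h), i.e. multiply by a fixed-point multiplier, shift right with rounding, add the output offset, then clamp to the output type's range. A rough scalar sketch of that arithmetic, assuming a gemmlowp-style Q0.31 multiplier, a non-negative shift, and ignoring the exact saturation corner cases of the NEON helpers:

#include <algorithm>
#include <cstdint>
#include <iostream>

// Rough scalar equivalent of the int32 -> quantized output stage.
int32_t requantize(int32_t acc, int32_t multiplier, int shift, int32_t offset,
                   int32_t qmin, int32_t qmax)
{
    // Fixed-point multiply: multiplier is a Q0.31 value in [0.5, 1).
    const int64_t prod    = static_cast<int64_t>(acc) * multiplier;
    const int32_t mulhigh = static_cast<int32_t>((prod + (int64_t(1) << 30)) >> 31);

    // Rounding arithmetic shift right by 'shift' (assumed >= 0 here).
    const int32_t rounding = (shift > 0) ? (1 << (shift - 1)) : 0;
    const int32_t scaled   = (mulhigh + rounding) >> shift;

    // Add the zero-point offset and clamp to the output type's range.
    return std::min(qmax, std::max(qmin, scaled + offset));
}

int main()
{
    // e.g. accumulator 1000, effective scale ~0.7 / 2^3, offset 10, uint8 range
    std::cout << requantize(1000, int32_t(0.7 * (int64_t(1) << 31)), 3, 10, 0, 255)
              << "\n"; // prints 98
}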
win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator bi(bias, window_bias); - Iterator out(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()) + x; - int32x4x4_t v_in = - { - { - wrapper::vloadq(in_ptr), - wrapper::vloadq(in_ptr + 4), - wrapper::vloadq(in_ptr + 8), - wrapper::vloadq(in_ptr + 12), - } - }; - - // Accumulate bias - if(has_bias) - { - const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; - - wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr)); - wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4)); - wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8)); - wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12)); - } - - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false)); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()) + x; - int32_t s_in = *in_ptr; - - // Accumulate bias - if(has_bias) - { - const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; - s_in += *bias_ptr; - } - - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, - std::numeric_limits::lowest(), std::numeric_limits::max(), false); - } - }, - in, bi, out); -} -} // namespace - -void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, - const DirectConvolutionLayerOutputStageKernelInfo &info) -{ - ARM_COMPUTE_UNUSED(bias); - // Perform validation step - ARM_COMPUTE_ERROR_ON_NULLPTR(src); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); - - _func = nullptr; - _result_fixedpoint_multiplier = info.result_fixedpoint_multiplier; - _result_shift = info.result_shift; - _result_offset_after_shift = info.result_offset_after_shift; - - // Auto-initialize output output if required - if(dst != nullptr) - { - // Work out expected output data type - const DataType output_dt = (src->data_type() == DataType::S32) ? info.output_data_type : DataType::S32; - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_data_type(output_dt)); - } - - Window win = calculate_max_window(*src, Steps()); - - ICpuKernel::configure(win); - - const bool is_qasymm8_signed = (dst != nullptr) ? 
is_data_type_quantized_asymmetric_signed(dst->data_type()) : false; - - // Set appropriate function - if(src->data_layout() == DataLayout::NCHW) - { - switch(src->data_type()) - { - case DataType::S32: - { - if(is_qasymm8_signed) - { - _func = &output_stage_nchw; - } - else - { - _func = &output_stage_nchw; - } - break; - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - _func = &output_stage_nchw; - break; - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - { - _func = &output_stage_nchw; - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs."); - } - } - } - else - { - switch(src->data_type()) - { - case DataType::S32: - { - if(is_qasymm8_signed) - { - _func = &output_stage_nhwc; - } - else - { - _func = &output_stage_nhwc; - } - break; - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - _func = &output_stage_nhwc; - break; - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - { - _func = &output_stage_nhwc; - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs."); - } - } - } -} - -Status CpuDirectConv2dOutputStageKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, - const DirectConvolutionLayerOutputStageKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info)); - return Status{}; -} - -void CpuDirectConv2dOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - auto src = tensors.get_tensor(TensorType::ACL_SRC_0); - auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - (*_func)(src, bias, window, dst, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift); -} - -const char *CpuDirectConv2dOutputStageKernel::name() const -{ - return "CpuDirectConv2dOutputStageKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h b/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h deleted file mode 100644 index 749411c0a7..0000000000 --- a/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DIRECT_CONV2D_OUTPUT_STAGE_KERNEL_H -#define ARM_COMPUTE_CPU_DIRECT_CONV2D_OUTPUT_STAGE_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel to accumulate the biases, if provided, or downscale in case of quantized input. - * - * @note We assume bias to be shared - * @note For quantized computations (i.e. @p src of S32 type) the output data type for auto-initialization must be passed as part - * of the @ref DirectConvolutionLayerOutputStageKernelInfo. - */ -class CpuDirectConv2dOutputStageKernel : public ICpuKernel -{ -public: - CpuDirectConv2dOutputStageKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv2dOutputStageKernel); - /** Set the accumulate buffer and the biases of the kernel. - * - * @param[in, out] src Input to add the bias to. If @p dst is not specified then accumulation is done in-place. - * Data type supported: F16/F32/S32 - * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p src - * @param[out] dst (Optional) If the dst tensor is specified the accumulation is done out-of-place. (Defaults to nullptr) - * Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr. - * Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32 - * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata - */ - void configure(ITensorInfo *src, const ITensorInfo *bias = nullptr, ITensorInfo *dst = nullptr, - const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuDirectConv2dOutputStageKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias = nullptr, const ITensorInfo *dst = nullptr, - const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - using OutputStageKernel = void(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift); - - OutputStageKernel *_func{ nullptr }; - int _result_fixedpoint_multiplier{ 0 }; - int _result_shift{ 0 }; - int _result_offset_after_shift{ 0 }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DIRECT_CONV2D_OUTPUT_STAGE_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuElementwiseKernel.cpp b/src/core/cpu/kernels/CpuElementwiseKernel.cpp deleted file mode 100644 index dc574fce65..0000000000 --- a/src/core/cpu/kernels/CpuElementwiseKernel.cpp +++ /dev/null @@ -1,454 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. 
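Reviewer note: the output-stage kernel above is a good example of the configure-time selection / run-time dispatch idiom used throughout this patch: configure() picks a specialised free function once, stores it in the _func member, and run_op() calls through that pointer with no per-element branching. A trimmed-down sketch of the idiom with hypothetical types and names:

#include <cstdio>

enum class Layout { NCHW, NHWC };

using StageFn = void(const float *in, float *out, int n);

void stage_nchw(const float *in, float *out, int n) { for(int i = 0; i < n; ++i) { out[i] = in[i] + 1.f; } }
void stage_nhwc(const float *in, float *out, int n) { for(int i = 0; i < n; ++i) { out[i] = in[i] + 2.f; } }

struct MiniOutputStage
{
    void configure(Layout layout)
    {
        _func = (layout == Layout::NCHW) ? &stage_nchw : &stage_nhwc; // decided once
    }
    void run(const float *in, float *out, int n) const
    {
        (*_func)(in, out, n); // no layout/data-type branching at run time
    }
    StageFn *_func = nullptr;
};

int main()
{
    MiniOutputStage k;
    k.configure(Layout::NHWC);
    float in[3] = { 1.f, 2.f, 3.f }, out[3];
    k.run(in, out, 3);
    std::printf("%g %g %g\n", out[0], out[1], out[2]); // 3 4 5
}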
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuElementwiseKernel.h" - -#include "arm_compute/core/Helpers.h" -#include "src/core/CPP/Validate.h" -#include "src/core/common/Registrars.h" -#include "src/core/cpu/kernels/elementwise/neon/elementwise_list.h" -#include "src/core/cpu/kernels/elementwise/neon/elementwise_quantized_list.h" -#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h" -#include "src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -struct ElementwiseSelectorData -{ - DataType dt; - const CPUInfo &ci; -}; - -using ElementwiseSelector = std::add_pointer::type; -using UKernelType = CpuElementwiseKernel::ElementwiseFunction; -struct ElementwiseKernel -{ - const char *name; - const ElementwiseSelector is_selected; - UKernelType *ukernel; -}; - -template -CpuElementwiseKernel::UKernelInfo configure_arithm_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) -{ - ARM_COMPUTE_UNUSED(src1, dst); - static ElementwiseKernel kernels[] = - { -#if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_fp32_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, - REGISTER_FP32_SVE((arm_compute::cpu::elementwise_arithmetic_op)) - }, - { - "sve_s32_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::S32 && data.ci.has_sve(); }, - REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op)) - }, - { - "sve_s16_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16 && data.ci.has_sve(); }, - REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ -#if defined(ARM_COMPUTE_ENABLE_NEON) - { - "neon_fp32_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON((arm_compute::cpu::elementwise_arithm_op>)) - }, - { - "neon_s32_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::S32; }, - REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op>)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ -#if defined(ARM_COMPUTE_ENABLE_SVE2) - { - "sve2_qu8_elementwise", - [](const 
ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op)) - }, - { - "sve2_qs8_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ -#if defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) - { - "neon_qu8_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_arithm_op_quantized)) - }, - { - "neon_qs8_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_arithm_op_quantized_signed)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */ -#if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_fp16_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, - REGISTER_FP16_SVE((arm_compute::cpu::elementwise_arithmetic_op)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ -#if defined(ARM_COMPUTE_ENABLE_NEON) -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); }, - REGISTER_FP16_NEON((arm_compute::cpu::elementwise_arithm_op>)) - }, -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ - { - "neon_s16_elementwise", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16; }, - REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op>)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ - }; - - for(const auto &uk : kernels) - { - if(uk.is_selected({ src0->data_type(), CPUInfo::get() })) - { - return { uk.name, uk.ukernel }; - } - } - - return { "", nullptr }; -} - -template -CpuElementwiseKernel::UKernelInfo configure_comp_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) -{ - ARM_COMPUTE_UNUSED(src1, dst); - static ElementwiseKernel kernels[] = - { -#if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_u8_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::U8 && data.ci.has_sve(); }, - REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op)) - }, - { - "sve_fp32_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, - REGISTER_FP32_SVE((arm_compute::cpu::elementwise_comparison_op)) - }, - { - "sve_s16_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16 && data.ci.has_sve(); }, - REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op)) - }, - { - "sve_s32_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::S32 && data.ci.has_sve(); }, - REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ -#if defined(ARM_COMPUTE_ENABLE_NEON) - { - "neon_u8_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::U8; }, - REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_8)) - }, - { - "neon_fp32_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::F32; }, - 
REGISTER_FP32_NEON((arm_compute::cpu::elementwise_comp_op_32)) - }, - { - "neon_s16_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16; }, - REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_16)) - }, - { - "neon_s32_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::S32; }, - REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_32)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ -#if defined(ARM_COMPUTE_ENABLE_SVE2) - { - "sve2_qu8_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_comparison_quantized_op)) - }, - { - "sve2_qs8_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_comparison_quantized_op)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ -#if defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) - { - "neon_qu8_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_comp_op_quantized)) - }, - { - "neon_qs8_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_comp_op_quantized_signed)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */ -#if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_fp16_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, - REGISTER_FP16_SVE((arm_compute::cpu::elementwise_comparison_op)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ -#if defined(ARM_COMPUTE_ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_comparison", - [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); }, - REGISTER_FP16_NEON((arm_compute::cpu::elementwise_comp_op_16)) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ - }; - - for(const auto &uk : kernels) - { - if(uk.is_selected({ src0->data_type(), CPUInfo::get() })) - { - return { uk.name, uk.ukernel }; - } - } - - return { "", nullptr }; -} -} // namespace - -Status CpuElementwiseKernel::validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1); - - const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - - // Validate in case of configured dst - if(dst.total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), - "Wrong shape for output"); - } - - return Status{}; -} - -void CpuElementwiseKernel::configure_common(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - - const auto uk = get_implementation(src0, src1, dst); - - _run_method = uk.ukernel; - _name = std::string("CpuElementwiseKernel").append("/").append(uk.name); - - // If any of shapes is dynamic, expect a configured window and 
dst at run-time. - if(src0->is_dynamic() || src1->is_dynamic()) - { - return; - } - - auto shape_and_window = compute_output_shape_and_window(src0->tensor_shape(), src1->tensor_shape()); - auto_init_if_empty(*dst, shape_and_window.first, 1, src0->data_type()); - ICpuKernel::configure(shape_and_window.second); -} - -void CpuElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON(_run_method == nullptr); - - auto src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - _run_method(src0, src1, dst, window); -} - -const char *CpuElementwiseKernel::name() const -{ - return _name.c_str(); -} - -/** Arithmetic operators (min, max, squared_diff) */ -void CpuArithmeticKernel::configure(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); - _op = op; - configure_common(src0, src1, dst); -} - -Status CpuArithmeticKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32); - // Validate in case of configured dst - if(dst.total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst); - } - return validate_arguments_common(src0, src1, dst); -} - -Status CpuArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) -{ - ARM_COMPUTE_UNUSED(op); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst)); - return Status{}; -} - -CpuElementwiseKernel::UKernelInfo CpuArithmeticKernel::get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) -{ - switch(_op) - { - case ArithmeticOperation::MAX: - return configure_arithm_func(src0, src1, dst); - case ArithmeticOperation::MIN: - return configure_arithm_func(src0, src1, dst); - case ArithmeticOperation::SQUARED_DIFF: - return configure_arithm_func(src0, src1, dst); - case ArithmeticOperation::PRELU: - return configure_arithm_func(src0, src1, dst); - case ArithmeticOperation::DIV: - return configure_arithm_func(src0, src1, dst); - case ArithmeticOperation::POWER: - return configure_arithm_func(src0, src1, dst); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - return { "", nullptr }; -} - -/** The division operator */ - -void CpuDivisionKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); - _op = ArithmeticOperation::DIV; - configure_common(src0, src1, dst); -} - -Status CpuDivisionKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::S32, DataType::F16, DataType::F32); - return CpuArithmeticKernel::validate_arguments(src0, src1, dst); -} - -Status CpuDivisionKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst)); - return Status{}; -} - -/** The power 
operator */ -void CpuPowerKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); - _op = ArithmeticOperation::POWER; - configure_common(src0, src1, dst); -} - -Status CpuPowerKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::F16, DataType::F32); - return CpuArithmeticKernel::validate_arguments(src0, src1, dst); -} - -Status CpuPowerKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst)); - return Status{}; -} - -/** Comparison operators (equal, not equal, less than, greater than, less than or equal, greater than or equal) */ -void CpuComparisonKernel::configure(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); - _op = op; - configure_common(src0, src1, dst); -} - -Status CpuComparisonKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32); - // Validate in case of configured dst - if(dst.total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::U8); - } - return validate_arguments_common(src0, src1, dst); -} - -Status CpuComparisonKernel::validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) -{ - ARM_COMPUTE_UNUSED(op); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst)); - return Status{}; -} - -CpuElementwiseKernel::UKernelInfo CpuComparisonKernel::get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) -{ - switch(_op) - { - case ComparisonOperation::Equal: - return configure_comp_func(src0, src1, dst); - case ComparisonOperation::NotEqual: - return configure_comp_func(src0, src1, dst); - case ComparisonOperation::Greater: - return configure_comp_func(src0, src1, dst); - case ComparisonOperation::GreaterEqual: - return configure_comp_func(src0, src1, dst); - case ComparisonOperation::Less: - return configure_comp_func(src0, src1, dst); - case ComparisonOperation::LessEqual: - return configure_comp_func(src0, src1, dst); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - return { "", nullptr }; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuElementwiseKernel.h b/src/core/cpu/kernels/CpuElementwiseKernel.h deleted file mode 100644 index 75137da65d..0000000000 --- a/src/core/cpu/kernels/CpuElementwiseKernel.h +++ /dev/null @@ -1,222 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
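Reviewer note: the elementwise kernels above select their micro kernel from a static table of { name, selector predicate, ukernel } entries, keyed on data type and CPU features, where the first matching entry wins. A self-contained sketch of that registry pattern with hypothetical, trimmed-down types:

#include <functional>
#include <iostream>
#include <vector>

enum class DataType { F32, S32 };
struct SelectorData { DataType dt; bool has_sve; };

struct KernelEntry
{
    const char *name;
    std::function<bool(const SelectorData &)> is_selected;
    void (*ukernel)();
};

void sve_fp32_impl()  { std::cout << "running SVE F32 ukernel\n"; }
void neon_fp32_impl() { std::cout << "running NEON F32 ukernel\n"; }

const KernelEntry *select(const std::vector<KernelEntry> &table, const SelectorData &data)
{
    for(const auto &entry : table)
    {
        if(entry.is_selected(data)) { return &entry; } // order encodes priority
    }
    return nullptr;
}

int main()
{
    const std::vector<KernelEntry> table =
    {
        { "sve_fp32",  [](const SelectorData &d) { return d.dt == DataType::F32 && d.has_sve; }, sve_fp32_impl  },
        { "neon_fp32", [](const SelectorData &d) { return d.dt == DataType::F32; },              neon_fp32_impl },
    };
    if(const auto *uk = select(table, { DataType::F32, /*has_sve=*/false }))
    {
        std::cout << "selected: " << uk->name << "\n";
        uk->ukernel();
    }
}

As in the tables above, ordering matters: SVE entries are listed first so they win when the CPU reports the feature, otherwise the NEON fallback is picked.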
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H -#define ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for an element-wise operation kernel - * - * Element-wise operation is computed by: - * @f[ dst(x,y) = OP(src0(x,y), src1(x,y))@f] - * - */ -class CpuElementwiseKernel : public ICpuKernel -{ -public: - CpuElementwiseKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuElementwiseKernel); - - using ElementwiseFunction = void(const ITensor *, const ITensor *, ITensor *, const Window &); - struct UKernelInfo - { - std::string name; - std::function ukernel; - }; - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -protected: - /** Validate the argument passed to the kernel - * - * @param[in] src0 First tensor input. Data types supported: QASYMM8/S16/F16/S32/F32. - * @param[in] src1 Second tensor input. Data types supported: Same as @p src0. - * @param[in] dst Output tensor. Data types supported: Dependent on subclass. - */ - static Status validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); - - /** Commmon configure function for element-wise operators with no additional options (e.g. Min, Max, SquaredDiff) - * - */ - void configure_common(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); - - /** Function to get the micro kernel implementation - * - * @param[in] src0 First input tensor information - * @param[in] src1 Second input tensor information - * @param[in] dst Output tensor information - * - * @return the function instance for the micro kernel - */ - virtual UKernelInfo get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) = 0; - -protected: - std::function _run_method{ nullptr }; - std::string _name{}; -}; - -class CpuArithmeticKernel : public CpuElementwiseKernel -{ -public: - CpuArithmeticKernel() = default; - - /** Configure kernel - * - * @param[in] op Arithmetic operation to be executed. - * @param[in] src0 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32. - * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0. - * @param[out] dst Output tensor info. 
Data types supported: Same as @p src0. - */ - void configure(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); - - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuArithmeticKernel::configure() - * - * @return a status - */ - static Status validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); - -protected: - // Inherited methods overridden: - static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); - - ArithmeticOperation _op{}; - -private: - /** Function to get the micro kernel implementation - * - * @param[in] src0 First input tensor information - * @param[in] src1 Second input tensor information - * @param[in] dst Output tensor information - * - * @return the function instance for the micro kernel - */ - UKernelInfo get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) override; -}; - -class CpuDivisionKernel : public CpuArithmeticKernel -{ -public: - CpuDivisionKernel() = default; - - /** Configure kernel - * - * @param[in] src0 First tensor input info. Data types supported: S32/F16/F32. - * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0. - * @param[out] dst Output tensor info. Data types supported: Same as @p src0. - */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); - - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuDivisionKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); - -protected: - // Inherited methods overridden: - static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); -}; - -class CpuPowerKernel : public CpuArithmeticKernel -{ -public: - CpuPowerKernel() = default; - - /** Configure kernel - * - * @param[in] src0 First tensor input info. Data types supported: F16/F32. - * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0. - * @param[out] dst Output tensor info. Data types supported: Same as @p src0. - */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); - - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuPowerKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); - -protected: - // Inherited methods overridden: - static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); -}; - -class CpuComparisonKernel : public CpuElementwiseKernel -{ -public: - CpuComparisonKernel() = default; - - /** Configure kernel - * - * @param[in] op Comparison operation to be executed. - * @param[in] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. - * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0. - * @param[out] dst Output tensor info. Data types supported: U8. 
- */ - void configure(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); - - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuComparisonKernel::configure() - * - * @return a status - */ - static Status validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); - -protected: - // Inherited methods overridden: - static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); - -private: - /** Function to get the micro kernel implementation - * - * @param[in] src0 First input tensor information - * @param[in] src1 Second input tensor information - * @param[in] dst Output tensor information - * - * @return the function instance for the micro kernel - */ - UKernelInfo get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) override; - - ComparisonOperation _op{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H */ \ No newline at end of file diff --git a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp b/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp deleted file mode 100644 index b03c32f023..0000000000 --- a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
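Reviewer note: a hedged usage sketch of the arithmetic-kernel interface declared above, following the usual validate-then-configure protocol. This is illustrative only; it includes the pre-move internal header path that this patch deletes (after the move the class lives under the merged backend folder), and assumes the caller builds against the library's internal sources:

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/core/cpu/kernels/CpuElementwiseKernel.h"

using namespace arm_compute;

void example_configure_max()
{
    TensorInfo a(TensorShape(16U, 4U), 1, DataType::F32);
    TensorInfo b(TensorShape(16U, 4U), 1, DataType::F32);
    TensorInfo out; // left empty: configure() auto-initialises it

    // Validate first, then configure on the same infos.
    const Status st = cpu::kernels::CpuArithmeticKernel::validate(ArithmeticOperation::MAX, &a, &b, &out);
    if(st.error_code() == ErrorCode::OK)
    {
        cpu::kernels::CpuArithmeticKernel k;
        k.configure(ArithmeticOperation::MAX, &a, &b, &out);
    }
}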
- */ -#include "src/core/cpu/kernels/CpuElementwiseUnaryKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" -#include "src/core/common/Registrars.h" -#include "src/core/cpu/kernels/elementwise/neon/elementwise_unary_list.h" -#include "src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/ToolchainSupport.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -struct ElementwiseUnarySelectorData -{ - DataType dt; - const CPUInfo &ci; -}; -using ElementwiseUnarySelector = std::add_pointer::type; - -struct ElementwiseUnaryKernel -{ - const char *name; - const ElementwiseUnarySelector is_selected; - CpuElementwiseUnaryKernel::ElementwiseUnaryUkernelPtr ukernel; -}; - -static const ElementwiseUnaryKernel available_kernels[] = -{ -#if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_fp32_elementwise_unary", - [](const ElementwiseUnarySelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_SVE(arm_compute::cpu::elementwise_sve_op), - }, - { - "sve_fp16_elementwise_unary", - [](const ElementwiseUnarySelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_SVE(arm_compute::cpu::elementwise_sve_op<__fp16>), - }, - { - "sve_s32_elementwise_unary", - [](const ElementwiseUnarySelectorData & data) { return data.dt == DataType::S32; }, - REGISTER_INTEGER_SVE(arm_compute::cpu::elementwise_sve_op), - }, -#endif // defined(ARM_COMPUTE_ENABLE_SVE) -#if defined(ARM_COMPUTE_ENABLE_NEON) - { - "neon_fp32_elementwise_unary", - [](const ElementwiseUnarySelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::elementwise_op), - }, -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_elementwise_unary", - [](const ElementwiseUnarySelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP32_NEON(arm_compute::cpu::elementwise_op<__fp16>), - }, -#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_s32_elementwise_unary", - [](const ElementwiseUnarySelectorData & data) { return data.dt == DataType::S32; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::elementwise_op), - }, -#endif // defined(ARM_COMPUTE_ENABLE_NEON) -}; - -const ElementwiseUnaryKernel *get_implementation(DataType dt) -{ - for(const auto &uk : available_kernels) - { - if(uk.is_selected({ dt, CPUInfo::get() })) - { - return &uk; - } - } - return nullptr; -} -} // namespace - -void CpuElementwiseUnaryKernel::configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate(op, src, dst)); - const auto uk = get_implementation(src.data_type()); - ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - - _op = op; - _run_method = uk->ukernel; - _name = std::string("CpuElementwiseUnaryKernel").append("/").append(uk->name); - - // If input shape is dynamic, expect a configured window and dst at run-time. 
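Reviewer note: configure() above relies on the "total_size() == 0 means not yet configured" convention that recurs throughout these kernels, only filling in dst when nothing has been set on it. A hypothetical stand-in for ITensorInfo and auto_init_if_empty, just to illustrate that contract:

#include <cstddef>
#include <iostream>

struct MiniInfo
{
    size_t total_size = 0;   // 0 means "not yet configured"
    int    width = 0, height = 0;
};

void auto_init_if_empty(MiniInfo &dst, int w, int h)
{
    if(dst.total_size == 0) // never overwrite a caller-provided output
    {
        dst.width      = w;
        dst.height     = h;
        dst.total_size = static_cast<size_t>(w) * h;
    }
}

int main()
{
    MiniInfo empty, preset{ 64, 8, 8 };
    auto_init_if_empty(empty, 16, 4);  // initialised to 16x4
    auto_init_if_empty(preset, 16, 4); // left untouched at 8x8
    std::cout << empty.width << "x" << empty.height << ", "
              << preset.width << "x" << preset.height << "\n";
}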
- if(src.is_dynamic()) - { - return; - } - - auto shape_and_window = compute_output_shape_and_window(src.tensor_shape()); - auto_init_if_empty(dst, shape_and_window.first, 1, src.data_type()); - ICpuKernel::configure(shape_and_window.second); -} - -Status CpuElementwiseUnaryKernel::validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src); - - const auto *uk = get_implementation(src.data_type()); - ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - - switch(op) - { - case ElementWiseUnary::EXP: - case ElementWiseUnary::RSQRT: - case ElementWiseUnary::LOG: - case ElementWiseUnary::ROUND: - case ElementWiseUnary::SIN: - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32); - break; - case ElementWiseUnary::NEG: - case ElementWiseUnary::ABS: - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32); - break; - default: - ARM_COMPUTE_ERROR("ElementWiseUnary operation not supported"); - } - // Validate in case of configured dst - if(dst.total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); - } - - return Status{}; -} - -void CpuElementwiseUnaryKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - - auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - _run_method(src, dst, window, _op); -} - -const char *CpuElementwiseUnaryKernel::name() const -{ - return _name.c_str(); -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.h b/src/core/cpu/kernels/CpuElementwiseUnaryKernel.h deleted file mode 100644 index bda65a35e0..0000000000 --- a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
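Reviewer note: run_op() above pulls its operands out of an ITensorPack instead of storing tensor pointers on the kernel, which is what keeps these kernels stateless and reusable. A caller-side sketch of how such a pack is assembled, using real ACL types at this revision but a hypothetical calling context:

#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

// 'src' and 'dst' are ITensor implementations owned by the caller.
void pack_operands(ITensor *src, ITensor *dst)
{
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC, src); // read-only operand
    pack.add_tensor(TensorType::ACL_DST, dst);       // writable result
    // The pack is then handed to the scheduler together with the kernel's
    // window, and run_op() retrieves the tensors back by the same ids.
}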
- */ -#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H -#define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H - -#include "arm_compute/core/Types.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for an element-wise unary operation kernel - * - * Element-wise operation is computed by: - * @f[ dst(x) = OP(src(x))@f] - */ -class CpuElementwiseUnaryKernel : public ICpuKernel -{ -public: - CpuElementwiseUnaryKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuElementwiseUnaryKernel); - - /** Function to configure the @ref CpuElementwiseUnaryKernel - * - * @param[in] op Arithmetic operation to be executed. - * @param[in] src First tensor input. Data types supported: F16/F32, F16/F32/S32 for NEG/ABS operations. - * @param[out] dst Output tensor. Data types supported: Same as @p src. - */ - void configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuElementwiseUnaryKernel::configure() - * - * @return a status - */ - static Status validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - - /** Common signature for all the specialised elementwise unary micro-kernels - * - * @param[in] window Region on which to execute the kernel. - */ - using ElementwiseUnaryUkernelPtr = std::add_pointer::type; - -private: - ElementWiseUnary _op{}; - ElementwiseUnaryUkernelPtr _run_method{ nullptr }; - std::string _name{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuFillKernel.cpp b/src/core/cpu/kernels/CpuFillKernel.cpp deleted file mode 100644 index aab4d715ee..0000000000 --- a/src/core/cpu/kernels/CpuFillKernel.cpp +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuFillKernel.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -void CpuFillKernel::configure(const ITensorInfo *tensor, const PixelValue &constant_value) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); - _constant_value = constant_value; - - // Configure kernel window - Window win = calculate_max_window(*tensor, Steps()); - ICpuKernel::configure(win); -} - -void CpuFillKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto inout = tensors.get_tensor(TensorType::ACL_SRC_DST); - - // Collapse all the batches on the third dimension - bool has_collapsed = true; - Window collapsed = window.collapse_if_possible(window, Window::DimZ, &has_collapsed); - ARM_COMPUTE_ERROR_ON(!has_collapsed); - - uint8_t *const start_valid_region = inout->ptr_to_element(inout->info()->valid_region().anchor); - const auto window_width = static_cast(collapsed.x().end()) - static_cast(collapsed.x().start()); - const size_t element_size = inout->info()->element_size(); - - // Unroll X dimension - collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator tensor_it(inout, collapsed); - execute_window_loop(collapsed, [&](const Coordinates &) - { - uint8_t *base_addr = start_valid_region + tensor_it.offset(); - // Set memory - for(int i = 0; i < window_width; ++i) - { - std::memcpy(base_addr + i * element_size, &_constant_value.value, element_size); - } - - }, - tensor_it); -} - -const char *CpuFillKernel::name() const -{ - return "CpuFillKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuFillKernel.h b/src/core/cpu/kernels/CpuFillKernel.h deleted file mode 100644 index 9afdee4186..0000000000 --- a/src/core/cpu/kernels/CpuFillKernel.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CPU_FILL_KERNEL_H -#define ARM_COMPUTE_CPU_FILL_KERNEL_H - -#include "arm_compute/core/PixelValue.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel for filling a tensor with a given constant value */ -class CpuFillKernel : public ICpuKernel -{ -public: - CpuFillKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuFillKernel); - /** Configure kernel for a given list of arguments - * - * @param[in,out] tensor Tensor to fill. Supported data types: All - * @param[in] constant_value The value used to fill the planes of the tensor - */ - void configure(const ITensorInfo *tensor, const PixelValue &constant_value); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - PixelValue _constant_value{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_FILL_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuFloorKernel.cpp b/src/core/cpu/kernels/CpuFloorKernel.cpp deleted file mode 100644 index d41df6a1f5..0000000000 --- a/src/core/cpu/kernels/CpuFloorKernel.cpp +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuFloorKernel.h" - -#include "arm_compute/core/Coordinates.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "src/core/common/Registrars.h" -#include "src/core/cpu/kernels/floor/list.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -struct FloorSelectorData -{ - DataType dt; -}; - -using FloorSelectorPtr = std::add_pointer::type; -using FloorUKernelPtr = std::add_pointer::type; - -struct FloorUKernel -{ - const char *name; - const FloorSelectorPtr is_selected; - FloorUKernelPtr ukernel; -}; - -static const FloorUKernel available_kernels[] = -{ - { - "neon_fp16_floor", - [](const FloorSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_floor) - }, - { - "neon_fp32_floor", - [](const FloorSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_floor) - }, -}; - -/** Micro-kernel selector - * - * @param[in] data Selection data passed to help pick the appropriate micro-kernel - * - * @return A matching micro-kernel else nullptr - */ -const FloorUKernel *get_implementation(const FloorSelectorData &data) -{ - for(const auto &uk : available_kernels) - { - if(uk.is_selected(data)) - { - return &uk; - } - } - return nullptr; -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - - const auto *uk = get_implementation(FloorSelectorData{ src->data_type() }); - ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - - // Validate in case of configured output - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - - return Status{}; -} -} // namespace - -void CpuFloorKernel::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - - auto_init_if_empty(*dst, src->tensor_shape(), 1, src->data_type()); - - const auto *uk = get_implementation(FloorSelectorData{ src->data_type() }); - ARM_COMPUTE_ERROR_ON_NULLPTR(uk); - - _run_method = uk->ukernel; - _name = std::string("CpuFloorKernel").append("/").append(uk->name); - - // Configure kernel window - const Window win = calculate_max_window(*src, Steps()); - - ICPPKernel::configure(win); -} - -Window CpuFloorKernel::infer_window(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_UNUSED(dst); - ARM_COMPUTE_ERROR_ON(!bool(validate_arguments(src, dst))); - - Window win; - win.use_tensor_dimensions(src->tensor_shape()); - return win; -} - -Status CpuFloorKernel::validate(const ITensorInfo *input, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); - return Status{}; -} - -void CpuFloorKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - ARM_COMPUTE_ERROR_ON(tensors.empty()); - ARM_COMPUTE_ERROR_ON(_run_method == nullptr); - - const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); - ITensor *dst = 
tensors.get_tensor(TensorType::ACL_DST); - const auto len = static_cast(window.x().end()) - static_cast(window.x().start()); - - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator src_it(src, win); - Iterator dst_it(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - _run_method(src_it.ptr(), dst_it.ptr(), len); - }, - src_it, dst_it); -} - -const char *CpuFloorKernel::name() const -{ - return _name.c_str(); -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuFloorKernel.h b/src/core/cpu/kernels/CpuFloorKernel.h deleted file mode 100644 index 78534d2a1d..0000000000 --- a/src/core/cpu/kernels/CpuFloorKernel.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_FLOOR_KERNEL_H -#define ARM_COMPUTE_CPU_FLOOR_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Cpu accelarated kernel to perform a floor operation */ -class CpuFloorKernel : public ICpuKernel -{ -public: - CpuFloorKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuFloorKernel); - /** Configure kernel for a given list of arguments - * - * @param[in] src Source tensor. Data type supported: F16/F32. - * @param[out] dst Destination tensor. Same as @p src - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuFloorKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - /** Infer execution window - * - * @param[in] src Source tensor info. Data type supported: F16/F32. - * @param[in] dst Destination tensor info. 
Same as @p src - * - * @return an execution Window - */ - Window infer_window(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - using FloorUKernelPtr = std::add_pointer::type; - -private: - FloorUKernelPtr _run_method{ nullptr }; - std::string _name{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_FLOOR_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp b/src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp deleted file mode 100644 index a6b080c0ab..0000000000 --- a/src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -using namespace arm_compute::misc::shape_calculator; - -void CpuGemmInterleave4x4Kernel::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // dst auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_interleaved_shape(*src))); - - // Perform validate step - ARM_COMPUTE_ERROR_THROW_ON(CpuGemmInterleave4x4Kernel::validate(src, dst)); - - Window win = calculate_max_window(*src, Steps(1, 4)); - ICPPKernel::configure(win); -} - -Status CpuGemmInterleave4x4Kernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. 
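// Note (illustrative, not part of the deleted file): worked example of the interleaved shape this
// validate() step compares against, derived from the run_op() implementation below (the dst Y
// window is scaled by 0.25 and, for each column x of four consecutive src rows, the four values
// are packed into dst offsets 4*x .. 4*x+3):
//
//   src: dim0 x dim1 = 3 x 5              dst: dim0 x dim1 = 12 x 2
//   row group 0 (src rows 0..3) -> dst row 0: a00 a10 a20 a30  a01 a11 a21 a31  a02 a12 a22 a32
//   row group 1 (src row 4)     -> dst row 1: a40 0 0 0        a41 0 0 0        a42 0 0 0
//
// i.e. dst dimension 0 is src dimension 0 * 4 and dst dimension 1 is ceil(src dimension 1 / 4),
// with the missing rows of a partial 4-row group zero-filled (the std::memset branch in run_op()).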
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - - if(dst->total_size() != 0) - { - const TensorShape dst_shape = compute_interleaved_shape(*src); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - } - - return Status{}; -} - -void CpuGemmInterleave4x4Kernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - ARM_COMPUTE_ERROR_ON(tensors.empty()); - /* - * This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values) - * |a00 a01 a02 a03| - * |a10 a11 a12 a13| - * |a20 a21 a22 a23| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 | - * |a30 a31 a32 a33| - * - * After this operation, the dst matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ] - */ - const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); - ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - - const size_t window_start_x = window.x().start(); - const size_t window_end_x = window.x().end(); - - const size_t in_height = src->info()->dimension(1); - const size_t in_stride = src->info()->strides_in_bytes()[1]; - - const size_t partial_y = in_height % 4; - - const size_t element_size = src->info()->element_size(); - - // Set window for the src tensor - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Set window for the dst tensor - Window win_out(window); - win_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_out.scale(Window::DimY, 0.25f); - - Iterator in(src, win); - Iterator out(dst, win_out); - - execute_window_loop(win, [&](const Coordinates & id) - { - if(id.y() + 4 <= static_cast(in_height)) - { - for(size_t x = window_start_x; x < window_end_x; ++x) - { - std::memcpy(out.ptr() + (x * 4 + 0) * element_size, (in.ptr() + 0 * in_stride) + x * element_size, element_size); - std::memcpy(out.ptr() + (x * 4 + 1) * element_size, (in.ptr() + 1 * in_stride) + x * element_size, element_size); - std::memcpy(out.ptr() + (x * 4 + 2) * element_size, (in.ptr() + 2 * in_stride) + x * element_size, element_size); - std::memcpy(out.ptr() + (x * 4 + 3) * element_size, (in.ptr() + 3 * in_stride) + x * element_size, element_size); - } - } - else - { - for(size_t x = window_start_x; x < window_end_x; ++x) - { - size_t y = 0; - for(; y < partial_y; ++y) - { - std::memcpy(out.ptr() + (x * 4 + y) * element_size, (in.ptr() + y * in_stride) + x * element_size, element_size); - } - for(; y < 4; ++y) - { - std::memset(out.ptr() + (x * 4 + y) * element_size, 0, element_size); - } - } - } - }, - in, out); -} - -const char *CpuGemmInterleave4x4Kernel::name() const -{ - return "CpuGemmInterleave4x4Kernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h b/src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h deleted file mode 100644 index 0c55886d8d..0000000000 --- a/src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_GEMM_INTERLEAVE4x4_KERNEL_H -#define ARM_COMPUTE_CPU_GEMM_INTERLEAVE4x4_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel to interleave the elements of a matrix - * - * This function puts the values in a 4x4 block of Matrix A on the same row (Interleaved values) - * - * @f[ - * \left( \begin{array}{cccc} - * a00 & a01 & a02 & a03 \\ - * a10 & a11 & a12 & a13 \\ - * a20 & a21 & a22 & a23 \\ - * a30 & a31 & a32 & a33 \\ - * \end{array} \right) - * \rightarrow - * \left( \begin{array}{ccccccccccccccccc} - * a00 & a10 & a20 & a30 & a01 & a11 & a21 & a31 & a02 & a12 & a22 & a32 & a03 & a13 & a23 & a33 \\ - * \end{array} \right) - * @f] - * - * After this operation, the dst matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ] - */ -class CpuGemmInterleave4x4Kernel : public ICpuKernel -{ -public: - CpuGemmInterleave4x4Kernel() = default; - /** Initialise the kernel's src and dst. - * - * @param[in] src Input tensor info. Data types supported: All - * @param[out] dst Output tensor info which stores the interleaved matrix. Data type supported: same as @p src. - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmInterleave4x4Kernel - * - * Similar to @ref CpuGemmInterleave4x4Kernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_GEMM_INTERLEAVE4x4_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp b/src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp deleted file mode 100644 index 35e542faa4..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp +++ /dev/null @@ -1,1053 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -void inline vector_matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window) -{ - execute_window_loop(window, [&](const Coordinates & id) - { - if(id.x() > width_b) - { - return; - } - - // Note: Since the input are all positives, we can use uint32_t - // Accumulators for the block 0 - uint32x4x4_t c0 = - { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) - } - }; - - auto vec_a = reinterpret_cast(ina.ptr()); - auto matrix_b = reinterpret_cast(inb.ptr()); - auto vec_a_end_addr = vec_a + width_a; - - // This for loop performs 8 accumulations - for(; vec_a <= (vec_a_end_addr - 8);) - { - const uint8x8_t a00_u8 = vld1_u8(vec_a); - const uint8x16_t b00_u8 = vld1q_u8(matrix_b + 0 * stride_b); - const uint8x16_t b10_u8 = vld1q_u8(matrix_b + 1 * stride_b); - const uint8x16_t b20_u8 = vld1q_u8(matrix_b + 2 * stride_b); - const uint8x16_t b30_u8 = vld1q_u8(matrix_b + 3 * stride_b); - const uint8x16_t b40_u8 = vld1q_u8(matrix_b + 4 * stride_b); - const uint8x16_t b50_u8 = vld1q_u8(matrix_b + 5 * stride_b); - const uint8x16_t b60_u8 = vld1q_u8(matrix_b + 6 * stride_b); - const uint8x16_t b70_u8 = vld1q_u8(matrix_b + 7 * stride_b); - - // Convert a00_u8 to uint16_t and get the lower part - const uint16x4x2_t a00_u16 = - { - { - vget_low_u16(vmovl_u8(a00_u8)), - vget_high_u16(vmovl_u8(a00_u8)) - } - }; - - const uint16x4x4_t b00_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b00_u8))) - } - }; - - const uint16x4x4_t b10_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b10_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b10_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b10_u8))), - 
vget_high_u16(vmovl_u8(vget_high_u8(b10_u8))) - } - }; - - const uint16x4x4_t b20_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b20_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b20_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b20_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b20_u8))) - } - }; - - const uint16x4x4_t b30_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b30_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b30_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b30_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b30_u8))) - } - }; - - const uint16x4x4_t b40_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b40_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b40_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b40_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b40_u8))) - } - }; - - const uint16x4x4_t b50_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b50_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b50_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b50_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b50_u8))) - } - }; - - const uint16x4x4_t b60_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b60_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b60_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b60_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b60_u8))) - } - }; - - const uint16x4x4_t b70_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b70_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b70_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b70_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b70_u8))) - } - }; - - // Accumulate 0: - c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16.val[0], 0); - c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16.val[0], 0); - c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16.val[0], 0); - c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16.val[0], 0); - - // Accumulate 1: - c0.val[0] = vmlal_lane_u16(c0.val[0], b10_u16.val[0], a00_u16.val[0], 1); - c0.val[1] = vmlal_lane_u16(c0.val[1], b10_u16.val[1], a00_u16.val[0], 1); - c0.val[2] = vmlal_lane_u16(c0.val[2], b10_u16.val[2], a00_u16.val[0], 1); - c0.val[3] = vmlal_lane_u16(c0.val[3], b10_u16.val[3], a00_u16.val[0], 1); - - // Accumulate 2: - c0.val[0] = vmlal_lane_u16(c0.val[0], b20_u16.val[0], a00_u16.val[0], 2); - c0.val[1] = vmlal_lane_u16(c0.val[1], b20_u16.val[1], a00_u16.val[0], 2); - c0.val[2] = vmlal_lane_u16(c0.val[2], b20_u16.val[2], a00_u16.val[0], 2); - c0.val[3] = vmlal_lane_u16(c0.val[3], b20_u16.val[3], a00_u16.val[0], 2); - - // Accumulate 3: - c0.val[0] = vmlal_lane_u16(c0.val[0], b30_u16.val[0], a00_u16.val[0], 3); - c0.val[1] = vmlal_lane_u16(c0.val[1], b30_u16.val[1], a00_u16.val[0], 3); - c0.val[2] = vmlal_lane_u16(c0.val[2], b30_u16.val[2], a00_u16.val[0], 3); - c0.val[3] = vmlal_lane_u16(c0.val[3], b30_u16.val[3], a00_u16.val[0], 3); - - // Accumulate 4: - c0.val[0] = vmlal_lane_u16(c0.val[0], b40_u16.val[0], a00_u16.val[1], 0); - c0.val[1] = vmlal_lane_u16(c0.val[1], b40_u16.val[1], a00_u16.val[1], 0); - c0.val[2] = vmlal_lane_u16(c0.val[2], b40_u16.val[2], a00_u16.val[1], 0); - c0.val[3] = vmlal_lane_u16(c0.val[3], b40_u16.val[3], a00_u16.val[1], 0); - - // Accumulate 5: - c0.val[0] = vmlal_lane_u16(c0.val[0], b50_u16.val[0], a00_u16.val[1], 1); - c0.val[1] = vmlal_lane_u16(c0.val[1], b50_u16.val[1], a00_u16.val[1], 1); - c0.val[2] = vmlal_lane_u16(c0.val[2], b50_u16.val[2], a00_u16.val[1], 1); - c0.val[3] = vmlal_lane_u16(c0.val[3], b50_u16.val[3], a00_u16.val[1], 1); - - // Accumulate 6: - c0.val[0] = vmlal_lane_u16(c0.val[0], b60_u16.val[0], a00_u16.val[1], 2); - 
c0.val[1] = vmlal_lane_u16(c0.val[1], b60_u16.val[1], a00_u16.val[1], 2); - c0.val[2] = vmlal_lane_u16(c0.val[2], b60_u16.val[2], a00_u16.val[1], 2); - c0.val[3] = vmlal_lane_u16(c0.val[3], b60_u16.val[3], a00_u16.val[1], 2); - - // Accumulate 7: - c0.val[0] = vmlal_lane_u16(c0.val[0], b70_u16.val[0], a00_u16.val[1], 3); - c0.val[1] = vmlal_lane_u16(c0.val[1], b70_u16.val[1], a00_u16.val[1], 3); - c0.val[2] = vmlal_lane_u16(c0.val[2], b70_u16.val[2], a00_u16.val[1], 3); - c0.val[3] = vmlal_lane_u16(c0.val[3], b70_u16.val[3], a00_u16.val[1], 3); - - vec_a += 8; - matrix_b += 8 * stride_b; - } - - // This for loop performs the left-over accumulations - for(; vec_a < vec_a_end_addr;) - { - const uint8x8_t a00_u8 = vld1_dup_u8(vec_a); - const uint8x16_t b00_u8 = vld1q_u8(matrix_b); - - const uint16x4x4_t b00_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b00_u8))) - } - }; - - // Convert a00_u8 to uint16_t and get the lower part - const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); - - // Accumulate 0: - c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0); - c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0); - c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0); - c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0); - - vec_a += 1; - matrix_b += stride_b; - } - - auto vec_out = reinterpret_cast(out.ptr()); - if(id.x() < (width_out - 16)) - { - vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0])); - vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1])); - vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2])); - vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3])); - } - else - { - auto left_over = width_out - id.x(); - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) - { - *(vec_out + k * 4 + j) = c0.val[k][j]; - } - } - } - }, - ina, inb, out); -} - -void inline vector_matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window) -{ - execute_window_loop(window, [&](const Coordinates & id) - { - if(id.x() > width_b) - { - return; - } - - // Accumulators for the block 0 - int32x4x4_t c0 = - { - { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) - } - }; - - auto vec_a = reinterpret_cast(ina.ptr()); - auto matrix_b = reinterpret_cast(inb.ptr()); - auto vec_a_end_addr = vec_a + width_a; - - // This for loop performs 8 accumulations - for(; vec_a <= (vec_a_end_addr - 8);) - { - const int8x8_t a00_s8 = vld1_s8(vec_a); - const int8x16_t b00_s8 = vld1q_s8(matrix_b + 0 * stride_b); - const int8x16_t b10_s8 = vld1q_s8(matrix_b + 1 * stride_b); - const int8x16_t b20_s8 = vld1q_s8(matrix_b + 2 * stride_b); - const int8x16_t b30_s8 = vld1q_s8(matrix_b + 3 * stride_b); - const int8x16_t b40_s8 = vld1q_s8(matrix_b + 4 * stride_b); - const int8x16_t b50_s8 = vld1q_s8(matrix_b + 5 * stride_b); - const int8x16_t b60_s8 = vld1q_s8(matrix_b + 6 * stride_b); - const int8x16_t b70_s8 = vld1q_s8(matrix_b + 7 * stride_b); - - // Convert a00_s8 to int16_t and get the lower part - const int16x4x2_t a00_s16 = - { - { - vget_low_s16(vmovl_s8(a00_s8)), - vget_high_s16(vmovl_s8(a00_s8)) - } - }; - - const int16x4x4_t b00_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), - 
vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b00_s8))) - } - }; - - const int16x4x4_t b10_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b10_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b10_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b10_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b10_s8))) - } - }; - - const int16x4x4_t b20_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b20_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b20_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b20_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b20_s8))) - } - }; - - const int16x4x4_t b30_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b30_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b30_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b30_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b30_s8))) - } - }; - - const int16x4x4_t b40_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b40_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b40_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b40_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b40_s8))) - } - }; - - const int16x4x4_t b50_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b50_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b50_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b50_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b50_s8))) - } - }; - - const int16x4x4_t b60_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b60_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b60_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b60_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b60_s8))) - } - }; - - const int16x4x4_t b70_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b70_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b70_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b70_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b70_s8))) - } - }; - - // Accumulate 0: - c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16.val[0], 0); - c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16.val[0], 0); - c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16.val[0], 0); - c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16.val[0], 0); - - // Accumulate 1: - c0.val[0] = vmlal_lane_s16(c0.val[0], b10_s16.val[0], a00_s16.val[0], 1); - c0.val[1] = vmlal_lane_s16(c0.val[1], b10_s16.val[1], a00_s16.val[0], 1); - c0.val[2] = vmlal_lane_s16(c0.val[2], b10_s16.val[2], a00_s16.val[0], 1); - c0.val[3] = vmlal_lane_s16(c0.val[3], b10_s16.val[3], a00_s16.val[0], 1); - - // Accumulate 2: - c0.val[0] = vmlal_lane_s16(c0.val[0], b20_s16.val[0], a00_s16.val[0], 2); - c0.val[1] = vmlal_lane_s16(c0.val[1], b20_s16.val[1], a00_s16.val[0], 2); - c0.val[2] = vmlal_lane_s16(c0.val[2], b20_s16.val[2], a00_s16.val[0], 2); - c0.val[3] = vmlal_lane_s16(c0.val[3], b20_s16.val[3], a00_s16.val[0], 2); - - // Accumulate 3: - c0.val[0] = vmlal_lane_s16(c0.val[0], b30_s16.val[0], a00_s16.val[0], 3); - c0.val[1] = vmlal_lane_s16(c0.val[1], b30_s16.val[1], a00_s16.val[0], 3); - c0.val[2] = vmlal_lane_s16(c0.val[2], b30_s16.val[2], a00_s16.val[0], 3); - c0.val[3] = vmlal_lane_s16(c0.val[3], b30_s16.val[3], a00_s16.val[0], 3); - - // Accumulate 4: - c0.val[0] = vmlal_lane_s16(c0.val[0], b40_s16.val[0], a00_s16.val[1], 0); - c0.val[1] = vmlal_lane_s16(c0.val[1], b40_s16.val[1], a00_s16.val[1], 0); - c0.val[2] = vmlal_lane_s16(c0.val[2], b40_s16.val[2], a00_s16.val[1], 0); - c0.val[3] = vmlal_lane_s16(c0.val[3], b40_s16.val[3], a00_s16.val[1], 0); - - // Accumulate 5: - c0.val[0] = vmlal_lane_s16(c0.val[0], b50_s16.val[0], a00_s16.val[1], 1); - c0.val[1] = vmlal_lane_s16(c0.val[1], 
b50_s16.val[1], a00_s16.val[1], 1); - c0.val[2] = vmlal_lane_s16(c0.val[2], b50_s16.val[2], a00_s16.val[1], 1); - c0.val[3] = vmlal_lane_s16(c0.val[3], b50_s16.val[3], a00_s16.val[1], 1); - - // Accumulate 6: - c0.val[0] = vmlal_lane_s16(c0.val[0], b60_s16.val[0], a00_s16.val[1], 2); - c0.val[1] = vmlal_lane_s16(c0.val[1], b60_s16.val[1], a00_s16.val[1], 2); - c0.val[2] = vmlal_lane_s16(c0.val[2], b60_s16.val[2], a00_s16.val[1], 2); - c0.val[3] = vmlal_lane_s16(c0.val[3], b60_s16.val[3], a00_s16.val[1], 2); - - // Accumulate 7: - c0.val[0] = vmlal_lane_s16(c0.val[0], b70_s16.val[0], a00_s16.val[1], 3); - c0.val[1] = vmlal_lane_s16(c0.val[1], b70_s16.val[1], a00_s16.val[1], 3); - c0.val[2] = vmlal_lane_s16(c0.val[2], b70_s16.val[2], a00_s16.val[1], 3); - c0.val[3] = vmlal_lane_s16(c0.val[3], b70_s16.val[3], a00_s16.val[1], 3); - - vec_a += 8; - matrix_b += 8 * stride_b; - } - - // This for loop performs the left-over accumulations - for(; vec_a < vec_a_end_addr;) - { - const int8x8_t a00_s8 = vld1_dup_s8(vec_a); - const int8x16_t b00_s8 = vld1q_s8(matrix_b); - - const int16x4x4_t b00_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b00_s8))) - } - }; - - // Convert a00_s8 to uint16_t and get the lower part - const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8)); - - // Accumulate 0: - c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0); - c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0); - c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0); - c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0); - - vec_a += 1; - matrix_b += stride_b; - } - - auto vec_out = reinterpret_cast(out.ptr()); - if(id.x() < (width_out - 16)) - { - vst1q_s32(vec_out + 0, c0.val[0]); - vst1q_s32(vec_out + 4, c0.val[1]); - vst1q_s32(vec_out + 8, c0.val[2]); - vst1q_s32(vec_out + 12, c0.val[3]); - } - else - { - auto left_over = width_out - id.x(); - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) - { - *(vec_out + k * 4 + j) = c0.val[k][j]; - } - } - } - }, - ina, inb, out); -} - -void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window) -{ - const auto width_out = static_cast(out_info.dimension(0)); - const auto height_out = static_cast(out_info.dimension(1)); - const size_t out_stride = out_info.strides_in_bytes()[1] / out_info.element_size(); - execute_window_loop(window, [&](const Coordinates & id) - { - const uint8_t *mtx_a0 = ina.ptr(); - const uint8_t *mtx_b0 = inb.ptr(); - - // Note: Since the input are all positives, we can use uint32_t - // Accumulators for the block 0 - uint32x4x4_t c0 = - { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) - } - }; - - // Accumulators for the block 1 - uint32x4x4_t c1 = - { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) - } - }; - - // Accumulators for the block 2 - uint32x4x4_t c2 = - { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) - } - }; - - // Accumulators for the block 3 - uint32x4x4_t c3 = - { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) - } - }; - - for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16) - { - const uint8x8_t a00_u8 = vld1_u8(mtx_a0); - const uint8x16_t b00_u8 = vld1q_u8(mtx_b0); - - // Convert 
a00_u8 to uint16_t and get the lower part - const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); - - // Convert b00_s8 to uint16_t - const uint16x4x4_t b00_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b00_u8))) - } - }; - - // 4x4 block 0 - c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0); - c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0); - c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0); - c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0); - - // 4x4 block 1 - c1.val[0] = vmlal_lane_u16(c1.val[0], b00_u16.val[0], a00_u16, 1); - c1.val[1] = vmlal_lane_u16(c1.val[1], b00_u16.val[1], a00_u16, 1); - c1.val[2] = vmlal_lane_u16(c1.val[2], b00_u16.val[2], a00_u16, 1); - c1.val[3] = vmlal_lane_u16(c1.val[3], b00_u16.val[3], a00_u16, 1); - - // 4x4 block 2 - c2.val[0] = vmlal_lane_u16(c2.val[0], b00_u16.val[0], a00_u16, 2); - c2.val[1] = vmlal_lane_u16(c2.val[1], b00_u16.val[1], a00_u16, 2); - c2.val[2] = vmlal_lane_u16(c2.val[2], b00_u16.val[2], a00_u16, 2); - c2.val[3] = vmlal_lane_u16(c2.val[3], b00_u16.val[3], a00_u16, 2); - - // 4x4 block 3 - c3.val[0] = vmlal_lane_u16(c3.val[0], b00_u16.val[0], a00_u16, 3); - c3.val[1] = vmlal_lane_u16(c3.val[1], b00_u16.val[1], a00_u16, 3); - c3.val[2] = vmlal_lane_u16(c3.val[2], b00_u16.val[2], a00_u16, 3); - c3.val[3] = vmlal_lane_u16(c3.val[3], b00_u16.val[3], a00_u16, 3); - } - - auto mtx_out = reinterpret_cast(out.ptr()); - - if(id.y() < height_out && id.x() < (width_out - 16)) - { - vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0])); - vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1])); - vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2])); - vst1q_s32(mtx_out + 0 * out_stride + 12, vreinterpretq_s32_u32(c0.val[3])); - if(id.y() + 1 < height_out) - { - vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0])); - vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1])); - vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2])); - vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3])); - if(id.y() + 2 < height_out) - { - vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0])); - vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1])); - vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2])); - vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3])); - if(id.y() + 3 < height_out) - { - vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0])); - vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1])); - vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2])); - vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3])); - } - } - } - } - else - { - const auto left_over_value = width_out - id.x(); - auto left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) - { - *(mtx_out + k * 4 + j) = c0.val[k][j]; - } - } - if(id.y() + 1 < height_out) - { - left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) - { - *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j]; - } - } - if(id.y() + 2 < height_out) - { - 
left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) - { - *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j]; - } - } - if(id.y() + 3 < height_out) - { - left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) - { - *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j]; - } - } - } - } - } - } - }, - ina, inb, out); -} - -void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window) -{ - const auto width_out = static_cast(out_info.dimension(0)); - const auto height_out = static_cast(out_info.dimension(1)); - const size_t out_stride = out_info.strides_in_bytes()[1] / out_info.element_size(); - // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with CpuGemmInterleave4x4 and CpuGemmTranspose1xW - // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration - // All the values needed for computing a single 4x4 block will be read from consecutive memory positions - execute_window_loop(window, [&](const Coordinates & id) - { - auto *mtx_a0 = reinterpret_cast(ina.ptr()); - auto *mtx_b0 = reinterpret_cast(inb.ptr()); - - // Note: Since the input are all positives, we can use uint32_t - // Accumulators for the block 0 - int32x4x4_t c0 = - { - { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) - } - }; - - // Accumulators for the block 1 - int32x4x4_t c1 = - { - { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) - } - }; - - // Accumulators for the block 2 - int32x4x4_t c2 = - { - { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) - } - }; - - // Accumulators for the block 3 - int32x4x4_t c3 = - { - { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) - } - }; - - for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16) - { - const int8x8_t a00_s8 = vld1_s8(mtx_a0); - const int8x16_t b00_s8 = vld1q_s8(mtx_b0); - - // Convert a00_s8 to uint16_t and get the lower part - const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8)); - - // Convert b00_s8 to int16_t - const int16x4x4_t b00_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b00_s8))) - } - }; - - // 4x4 block 0 - c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0); - c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0); - c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0); - c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0); - - // 4x4 block 1 - c1.val[0] = vmlal_lane_s16(c1.val[0], b00_s16.val[0], a00_s16, 1); - c1.val[1] = vmlal_lane_s16(c1.val[1], b00_s16.val[1], a00_s16, 1); - c1.val[2] = vmlal_lane_s16(c1.val[2], b00_s16.val[2], a00_s16, 1); - c1.val[3] = vmlal_lane_s16(c1.val[3], b00_s16.val[3], a00_s16, 1); - - // 4x4 block 2 - c2.val[0] = vmlal_lane_s16(c2.val[0], b00_s16.val[0], a00_s16, 2); - c2.val[1] = vmlal_lane_s16(c2.val[1], b00_s16.val[1], a00_s16, 2); - c2.val[2] = vmlal_lane_s16(c2.val[2], b00_s16.val[2], a00_s16, 2); - c2.val[3] = vmlal_lane_s16(c2.val[3], b00_s16.val[3], a00_s16, 2); - - // 4x4 block 3 - c3.val[0] = vmlal_lane_s16(c3.val[0], 
b00_s16.val[0], a00_s16, 3); - c3.val[1] = vmlal_lane_s16(c3.val[1], b00_s16.val[1], a00_s16, 3); - c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3); - c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3); - } - auto mtx_out = reinterpret_cast(out.ptr()); - if(id.y() < height_out && id.x() < (width_out - 16)) - { - vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]); - vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]); - vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]); - vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]); - if(id.y() + 1 < height_out) - { - vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]); - vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]); - vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]); - vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]); - if(id.y() + 2 < height_out) - { - vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]); - vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]); - vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]); - vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]); - if(id.y() + 3 < height_out) - { - vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]); - vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]); - vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]); - vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]); - } - } - } - } - else if(id.y() < height_out) - { - const auto left_over_value = width_out - id.x(); - auto left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) - { - *(mtx_out + k * 4 + j) = c0.val[k][j]; - } - } - if(id.y() + 1 < height_out) - { - left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) - { - *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j]; - } - } - if(id.y() + 2 < height_out) - { - left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) - { - *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j]; - } - } - if(id.y() + 3 < height_out) - { - left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) - { - *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j]; - } - } - } - } - } - } - - }, - ina, inb, out); -} - -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, DataType::U8); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S8, DataType::U8); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - - TensorShape in0_shape = src0->tensor_shape(); - TensorShape in1_shape = src1->tensor_shape(); - TensorShape out_shape = dst->tensor_shape(); - - // Check vector-by-matrix case - if(out_shape[1] == 1) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1], "The number of input0's columns must be equal to input1's rows"); - } - else - { - in0_shape.collapse(2); - in1_shape.collapse(2); - out_shape.collapse(2); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the 
same number of batches of input0 or the number of batches must be set to 1"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[0] % 16, "Input1's width must be a multiple of 16"); - } - - return Status{}; -} -} // namespace - -void CpuGemmLowpMatrixMultiplyKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) -{ - ARM_COMPUTE_UNUSED(src0); - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst)); - - TensorShape in1_shape = src1->tensor_shape(); - in1_shape.collapse(2); - - _slide_matrix_b = in1_shape[2] != 1; - - constexpr unsigned int num_elems_processed_per_iteration_x = 16; - constexpr unsigned int num_elems_processed_per_iteration_y = 4; - - Window win; - // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication - if((dst->dimension(1) == 1)) - { - // Configure kernel window - win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x)); - } - else - { - win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - } - - ICpuKernel::configure(win); -} - -Status CpuGemmLowpMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst)); - return Status{}; -} - -void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - // Check if the output tensor is a vector. 
If so,the kernel runs the vector-matrix multiplication path - if((dst->info()->dimension(1) == 1)) - { - const auto width_matrix_a = static_cast(src0->info()->dimension(0)); - const auto width_matrix_b = static_cast(src1->info()->dimension(0)); - const auto width_out = static_cast(dst->info()->dimension(0)); - const auto in_b_stride = static_cast(src1->info()->strides_in_bytes()[1] / data_size_from_type(src1->info()->data_type())); - - // The implementation computes 16 elements per iteration - const int window_start_x = 16 * info.thread_id; - const int window_step_x = 16 * info.num_threads; - // Make sure (window_end_x - window_start_x) is a multiple of window_step_x - const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; - - Window win_out(window); - win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); - win_out.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Window win_a(window); - win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_a.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Window win_b; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(src1->info()->num_dimensions() >= 3) - { - win_b = window; - } - win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); - win_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Iterator ina(src0, win_a); - Iterator inb(src1, win_b); - Iterator out(dst, win_out); - - switch(src0->info()->data_type()) - { - case DataType::S8: - case DataType::QASYMM8_SIGNED: - { - vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window); - break; - } - case DataType::U8: - case DataType::QASYMM8: - { - vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window); - break; - } - default: - { - ARM_COMPUTE_ERROR("Not supported"); - break; - } - } - } - else - { - const size_t in_b_stride = src1->info()->strides_in_bytes()[1]; - const int width_b = src1->info()->dimension(0); - - // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix - Window win_a(window); - win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, window.y().end() / 4, 1)); - - // Set step_x and step_y for matrix B. 
Scale by a factor of 16 the X range as the input transposed matrix A has 16 times less the columns of the output matrix - Window win_b; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(_slide_matrix_b) - { - win_b = window; - } - win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, in_b_stride)); - win_b.set(Window::DimY, Window::Dimension(0, 0, 0)); - - // The step x and step y for the output matrix has been already set using in configure() - Iterator ina(src0, win_a); - Iterator inb(src1, win_b); - Iterator out(dst, window); - - switch(src0->info()->data_type()) - { - case DataType::S8: - case DataType::QASYMM8_SIGNED: - { - matrix_multiply_s8(ina, inb, out, width_b, *dst->info(), window); - break; - } - case DataType::U8: - case DataType::QASYMM8: - { - matrix_multiply_u8(ina, inb, out, width_b, *dst->info(), window); - break; - } - default: - { - ARM_COMPUTE_ERROR("Not supported"); - break; - } - } - } -} - -const char *CpuGemmLowpMatrixMultiplyKernel::name() const -{ - return "CpuGemmLowpMatrixMultiplyKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h b/src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h deleted file mode 100644 index 77d8741b19..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_KERNEL_H -#define ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel to multiply matrices - * - * @note @ref CpuGemmLowpMatrixMultiplyKernel low precision matrix product kernel - * This kernel performs the following computation: - * - * -# Convert a values from int8 to int32 - * -# Convert b values from int8 to int32 - * -# Compute the int32 matrix product of the resulting a * b and store the result as int32 - * - */ -class CpuGemmLowpMatrixMultiplyKernel : public ICpuKernel -{ -public: - /** Default constructor */ - CpuGemmLowpMatrixMultiplyKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixMultiplyKernel); - /** Initialise the kernel's input and output. - * - * The input matrices @p src0 and @p src1 must be the output of the kernels: @ref CpuGemmInterleave4x4Kernel and @ref CpuGemmTranspose1xWKernel. These two - * kernels change the layout of the original matrices to be more cache-friendly. - * - * @param[in] src0 Input tensor info containing the interleaved Matrix A. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED - * @param[in] src1 Input tensor info containing the transposed1xW Matrix B. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL - * @param[out] dst Output tensor info to store the result of matrix multiplication. Data type supported: S32 - */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuGemmLowpMatrixMultiplyKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - bool _slide_matrix_b{ true }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /*ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_KERNEL_H*/ diff --git a/src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp b/src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp deleted file mode 100644 index 270abc8bbd..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp +++ /dev/null @@ -1,396 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/TensorInfo.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(1), "Output vector must have length equal to the number of rows of the input matrix"); - } - return Status{}; -} -Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(0), "Output vector must have length equal to the number of columns of the input matrix"); - } - return Status{}; -} -} // namespace - -void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(src, dst, info)); - _k = info.k; - _scalar = info.scalar; - _mul_by_scalar = info.mul_by_scalar; - - switch(src->data_type()) - { - case DataType::QASYMM8: - _func = &CpuGemmLowpMatrixAReductionKernel::run_internal; - break; - case DataType::QASYMM8_SIGNED: - case DataType::QSYMM8: - case DataType::QSYMM8_PER_CHANNEL: - _func = &CpuGemmLowpMatrixAReductionKernel::run_internal; - break; - default: - ARM_COMPUTE_ERROR("Unsupported data type"); - } - - // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, TensorShape(src->dimension(1)), 1, DataType::S32); - - Window win = calculate_max_window(*dst, Steps(1)); - ICpuKernel::configure(win); -} - -Status CpuGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(src, dst, info)); - return Status{}; -} - -template -void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor *src, ITensor *dst, const arm_compute::Window &window) -{ - // Intermediate 
and final accumulator types - using TIAcc = wrapper::traits::promote_t; - using TAcc = wrapper::traits::promote_t; - - Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY); - - Window win_input(collapsed_window); - win_input.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_input.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_input.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Iterator in(src, win_input); - Iterator out(dst, collapsed_window); - - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - auto vsum_row = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); - TAcc sum_row = 0; - - const T *matrix_a = reinterpret_cast((in.ptr() + id.x() * src->info()->strides_in_bytes()[1] + id.y() * src->info()->strides_in_bytes()[2])); - -#if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a)); -#endif /* __arm__ */ - - int i = 0; - // This for loop performs 16 accumulations - for(; i <= (_k - 16); i += 16) - { - const auto a0_d8 = wrapper::vloadq(matrix_a + i); - - // Partial accumulations in U16 - const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8)); - - // Accumulate to U32 - vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0)); - } - - // This for loop performs the leftover accumulations - for(; i < _k; ++i) - { - sum_row += static_cast(matrix_a[i]); - } - -#if defined(__aarch64__) - // Reduction operation available on 64 bit architectures only - sum_row += wrapper::vaddv(vsum_row); -#else // __aarch64__ - auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row)); - tmp = wrapper::vpadd(tmp, tmp); - - sum_row += wrapper::vgetlane(tmp, 0); -#endif // __aarch64__ - - // Multiply by scalar if necessary - if(_mul_by_scalar) - { - sum_row *= _scalar; - } - - *(reinterpret_cast(out.ptr())) = static_cast(sum_row); - }, - in, out); -} - -void CpuGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - (this->*_func)(src, dst, window); -} - -const char *CpuGemmLowpMatrixAReductionKernel::name() const -{ - return "CpuGemmLowpMatrixAReductionKernel"; -} - -void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(src, dst, info)); - - _k = info.k; - _scalar = info.scalar; - _mul_by_scalar = info.mul_by_scalar; - - // Configure kernel window - constexpr unsigned int num_elems_processed_per_iteration = 16; - - switch(src->data_type()) - { - case DataType::QASYMM8: - _func = &CpuGemmLowpMatrixBReductionKernel::run_internal; - break; - case DataType::QASYMM8_SIGNED: - case DataType::QSYMM8: - case DataType::QSYMM8_PER_CHANNEL: - _func = &CpuGemmLowpMatrixBReductionKernel::run_internal; - break; - default: - ARM_COMPUTE_ERROR("Unsupported data type"); - } - - // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, TensorShape(src->dimension(0)), 1, DataType::S32); - - // Configure kernel window - Window win = calculate_max_window_horizontal(*dst, Steps(num_elems_processed_per_iteration)); - ICpuKernel::configure(win); -} - 
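// CpuGemmLowpMatrixAReductionKernel produces per-row sums of matrix A and
// CpuGemmLowpMatrixBReductionKernel produces per-column sums of matrix B; both feed the
// later offset-contribution stage. The following is a minimal scalar sketch of what the
// B-reduction computes. The dense row-major int8_t layout and the helper name are
// assumptions made only for illustration and are not part of the original kernel.
static inline void reduce_matrix_b_columns_sketch(const int8_t *matrix_b, int32_t *sums,
                                                  int k, int width_b, int row_stride,
                                                  bool mul_by_scalar, int32_t scalar)
{
    for(int x = 0; x < width_b; ++x)
    {
        int32_t sum_col = 0;
        for(int row = 0; row < k; ++row)
        {
            sum_col += static_cast<int32_t>(matrix_b[row * row_stride + x]);
        }
        sums[x] = mul_by_scalar ? sum_col * scalar : sum_col; // one S32 sum per column of B
    }
}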
-Status CpuGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(src, dst, info)); - return Status{}; -} - -template -void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info) -{ - // Intermediate and final accumulator types - using TIAcc = wrapper::traits::promote_t; - using TAcc = wrapper::traits::promote_t; - - Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY); - const auto vec_scalar = wrapper::vdup_n(static_cast(_scalar), wrapper::traits::vector_128_tag{}); - - const auto width_matrix_b = static_cast(src->info()->dimension(0)); - const auto in_b_stride = static_cast(src->info()->strides_in_bytes()[1]); - - // The implementation computes 16 elements per iteration - const int window_start_x = 16 * info.thread_id; - const int window_step_x = 16 * info.num_threads; - // Make sure (window_end_x - window_start_x) is a multiple of window_step_x - const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; - - Window win_out(collapsed_window); - win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); - - Window win_in(win_out); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Iterator inb(src, win_in); - Iterator out(dst, win_out); - - execute_window_loop(win_out, [&](const Coordinates & id) - { - if(id.x() > width_matrix_b) - { - return; - } - - // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - typename wrapper::traits::neon_bitvector::type sum_col[4] = - { - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}) - }; - - const auto *matrix_b = reinterpret_cast(inb.ptr() + id.y() * src->info()->strides_in_bytes()[2]); - -#if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b)); - asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride)); -#endif /* __arm__ */ - - int i = 0; - // This for loop performs 4 accumulations - for(; i <= (_k - 4); i += 4) - { - const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); - const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride); - const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride); - const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride); - -#if __arm__ - asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride)); - asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride)); - asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride)); - asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride)); -#endif /* __arm__ */ - - // Partial accumulation in 16bit - typename wrapper::traits::neon_bitvector::type tmp_sum[2] = - { - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}) - }; - - tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8)); - tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8)); - tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8)); - tmp_sum[0] = wrapper::vaddw(tmp_sum[0], 
wrapper::vgetlow(b3_u8)); - tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8)); - tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8)); - tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8)); - tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8)); - - // Accumulate to 32bit - sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0])); - sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0])); - sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1])); - sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1])); - - matrix_b += 4 * in_b_stride; - } - - // This for loop perfoms the leftover accumulations - for(; i < _k; ++i) - { - const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); - - // Convert S8 to S16 - const typename wrapper::traits::neon_bitvector::type b0_b16[2] - { - wrapper::vmovl(wrapper::vgetlow(b0_b8)), - wrapper::vmovl(wrapper::vgethigh(b0_b8)) - }; - - // Accumulate to 32bit - sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0])); - sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0])); - sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1])); - sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1])); - - matrix_b += in_b_stride; - } - - // Multiply by scalar if necessary - if(_mul_by_scalar) - { - sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar); - sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar); - sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar); - sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar); - } - - auto vector_sum_col = reinterpret_cast(out.ptr()); - if(id.x() + 16 < width_matrix_b) - { - wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0])); - wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1])); - wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2])); - wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3])); - } - else - { - auto left_over = width_matrix_b - id.x(); - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) - { - *(vector_sum_col + k * 4 + j) = sum_col[k][j]; - } - } - } - }, - inb, out); -} - -void CpuGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - (this->*_func)(src, dst, window, info); -} - -const char *CpuGemmLowpMatrixBReductionKernel::name() const -{ - return "CpuGemmLowpMatrixBReductionKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h b/src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h deleted file mode 100644 index 106980fc0b..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_GEMMLOWP_REDUCTION_KERNEL_H -#define ARM_COMPUTE_CPU_GEMMLOWP_REDUCTION_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -// Forward declarations -struct GEMMLowpReductionKernelInfo; -namespace cpu -{ -namespace kernels -{ -/** Kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. - * - * @note This stage is needed to handle the offset of matrix product - * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md - */ -class CpuGemmLowpMatrixAReductionKernel : public ICpuKernel -{ -public: - /** Default constructor */ - CpuGemmLowpMatrixAReductionKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixAReductionKernel); - /** Initialise the kernel's input and output. - * - * @param[in] src Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL - * @param[out] dst Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k (num_mtx_a_cols) Number of matrix A columns - * - is_reshaped (is_interleaved4x4) True if the matrix A has been interleaved4x4 - * - scalar Scalar value to multiply each reduced row by. - * - mul_byscalar True if each reduced column must be multiplied by a scalar value. - */ - void configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuGemmLowpMatrixAReductionKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - /** Execution of the reduction kernel specialized on the input type - * - * @param[in] src Input tensor - * @param[in] dst Output tensor - * @param[in] window Execution window - */ - template - void run_internal(const ITensor *src, ITensor *dst, const Window &window); - - /** Common signature for all reduction functions - * - * @param[in] src Input tensor - * @param[out] dst Output tensor - * @param[in] window Region on which to execute the kernel. 
(Must be a valid region of the window returned by window()). - */ - using CpuGemmLowpMatrixAReductionKernelPtr = void (CpuGemmLowpMatrixAReductionKernel::*)(const ITensor *src, ITensor *dst, const Window &window); - - CpuGemmLowpMatrixAReductionKernelPtr _func{ nullptr }; - int32_t _k{ 0 }; - int32_t _scalar{ 0 }; - bool _mul_by_scalar{ false }; -}; - -/** Kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B. - * - * @note This stage is needed to handle the offset of matrix product - * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md - */ -class CpuGemmLowpMatrixBReductionKernel : public ICpuKernel -{ -public: - /** Default constructor */ - CpuGemmLowpMatrixBReductionKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixBReductionKernel); - /** Initialise the kernel's input and output. - * - * @param[in] src Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL - * @param[out] dst Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k (num_mtx_b_rows) Number of matrix B rows. - * - is_reshaped (is_transposed1xW) True if the input tensor is transposed 1xW. - * - scalar Scalar value to multiply each reduced row by. - * - mul_byscalar True if each reduced row must be multiplied by a scalar value. - */ - void configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuGemmLowpMatrixBReductionKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - /** Execution of the reduction kernel specialized on the input type - * - * @param[in] src Input tensor - * @param[in] dst Output tensor - * @param[in] window Execution window - * @param[in] info Thread-related information - */ - template - void run_internal(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info); - - /** Common signature for all reduction functions - * - * @param[in] src Input tensor - * @param[out] dst Output tensor - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - */ - using CpuGemmLowpMatrixBReductionKernelPtr = void (CpuGemmLowpMatrixBReductionKernel::*)(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info); - - CpuGemmLowpMatrixBReductionKernelPtr _func{ nullptr }; - int32_t _k{ 0 }; - int32_t _scalar{ 0 }; - bool _mul_by_scalar{ false }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_GEMMLOWP_REDUCTION_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp b/src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp deleted file mode 100644 index 9b1bf08955..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp +++ /dev/null @@ -1,417 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, - int32_t a_offset, int32_t b_offset) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); - } - - // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - - // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); - - // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); - ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); - - TensorShape output_shape = mm_result->tensor_shape(); - if(output_shape.num_dimensions() > 1) - { - const unsigned int output_batch_idx = reinterpret_as_3d ? 
3 : 2; - - TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); - vector_sum_row_shape.collapse_from(1); - output_shape.collapse_from(output_batch_idx); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], - "mm_result tensor must have the same number of batches of output tensor"); - - if(a_offset != 0) - { - TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); - vector_sum_col_shape.collapse_from(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); - } - } - } - - return Status{}; -} - -void run_offset_contribution(const Window &window, - ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, - int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, bool is_gemm3d) -{ - Window collapsed_window = window.collapse_if_possible(window, Window::DimZ); - collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0; - const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1; - - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 16; - - Iterator mm_result_it(mm_result, collapsed_window); - - if((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true - { - // Set window for vector_sum_col - Window win_vector_sum_col(collapsed_window); - win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - // Set window for vector_sum_row - Window win_vector_sum_row(collapsed_window); - win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col); - Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row); - - const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); - - // Offset in case vector_sum_col is batched - const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; - - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - auto mm_result_ptr = reinterpret_cast(mm_result_it.ptr()); - - // Compute the leftover term due to b_offset. - int32_t b_offset_term_s32 = *(reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input); - b_offset_term_s32 *= b_offset; - - const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Compute the leftover term due to a_offset. 
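// The a_offset term for column x is vector_sum_col[x] * a_offset, i.e. the precomputed
// column sum of matrix B scaled by matrix A's quantization offset; 16 columns (four
// int32x4 lanes) are handled per iteration of this loop.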
- int32x4x4_t a_offset_term_s32 = - { - { - vld1q_s32(vector_sum_col_ptr + x + 0), - vld1q_s32(vector_sum_col_ptr + x + 4), - vld1q_s32(vector_sum_col_ptr + x + 8), - vld1q_s32(vector_sum_col_ptr + x + 12) - } - }; - - a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); - a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); - a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); - a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); - - // Add a_offset_term_s32 and b_offset_term_s32 - int32x4x4_t offset_term_s32 = - { - { - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset) - } - }; - - offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32_vec)); - offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32_vec)); - offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32_vec)); - offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32_vec)); - - int32x4x4_t in_s32 = - { - { - vld1q_s32(mm_result_ptr + x + 0), - vld1q_s32(mm_result_ptr + x + 4), - vld1q_s32(mm_result_ptr + x + 8), - vld1q_s32(mm_result_ptr + x + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]); - - // Store the result with the offset contribution - vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); - vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); - vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); - vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Compute the leftover term due to a_offset. - int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x); - - a_offset_term_s32 *= a_offset; - - // Add the offset terms to GEMM's result - // Store the result with the offset contribution - mm_result_ptr[x] += k_offset + a_offset_term_s32 + b_offset_term_s32; - } - }, - vector_sum_col_it, vector_sum_row_it, mm_result_it); - } - else if((a_offset == 0) && (b_offset != 0) && (vector_sum_row != nullptr)) // false, true - { - ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row); - - // Set window for vector_sum_row - Window win_vector_sum_row(collapsed_window); - win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row); - - const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); - - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - auto mm_result_ptr = reinterpret_cast(mm_result_it.ptr()); - - // Compute the leftover term due to b_offset. 
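// The b_offset term is vector_sum_row[row] * b_offset, i.e. the precomputed row sum of
// matrix A scaled by matrix B's quantization offset; it is constant along the row, so it
// is computed once here and broadcast across the vector lanes.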
- int32_t b_offset_term_s32 = *(reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input); - b_offset_term_s32 *= b_offset; - - const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - int32x4x4_t in_s32 = - { - { - vld1q_s32(mm_result_ptr + x + 0), - vld1q_s32(mm_result_ptr + x + 4), - vld1q_s32(mm_result_ptr + x + 8), - vld1q_s32(mm_result_ptr + x + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32_vec); - in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32_vec); - in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32_vec); - in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32_vec); - - // Store the result with the offset contribution - vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); - vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); - vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); - vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Add the offset terms to GEMM's result - // Store the result with the offset contribution - mm_result_ptr[x] += b_offset_term_s32; - } - }, - vector_sum_row_it, mm_result_it); - } - else if((a_offset != 0) && (b_offset == 0) && (vector_sum_col != nullptr)) // true, false - { - // Set window for vector_sum_col - Window win_vector_sum_col(collapsed_window); - win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col); - - // Offset in case vector_sum_col is batched - const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; - - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - auto mm_result_ptr = reinterpret_cast(mm_result_it.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Compute the leftover term due to a_offset. 
- int32x4x4_t a_offset_term_s32 = - { - { - vld1q_s32(vector_sum_col_ptr + x + 0), - vld1q_s32(vector_sum_col_ptr + x + 4), - vld1q_s32(vector_sum_col_ptr + x + 8), - vld1q_s32(vector_sum_col_ptr + x + 12) - } - }; - - a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); - a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); - a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); - a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); - - int32x4x4_t in_s32 = - { - { - vld1q_s32(mm_result_ptr + x + 0), - vld1q_s32(mm_result_ptr + x + 4), - vld1q_s32(mm_result_ptr + x + 8), - vld1q_s32(mm_result_ptr + x + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]); - - // Store the result with the offset contribution - vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); - vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); - vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); - vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Compute the leftover term due to a_offset. - const int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x); - - // Add the offset terms to GEMM's result - // Store the result with the offset contribution - mm_result_ptr[x] += a_offset_term_s32 * a_offset; - } - }, - vector_sum_col_it, mm_result_it); - } - else // false, false - { - // No offset contribution from matrix A and matrix B - return; - } -} -} // namespace - -void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset) -{ - // Perform validate step - ARM_COMPUTE_UNUSED(vector_sum_row); - ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset)); - - _a_offset = a_offset; - _b_offset = b_offset; - _k_offset = a_offset * b_offset * k; - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - // Check if vector_sum_col_shape should be slidden or not - // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - _slide_vector_sum_col = vector_sum_col->tensor_shape().num_dimensions() > 1; - } - - // Configure kernel window - Window win = calculate_max_window(*mm_result, Steps()); - ICpuKernel::configure(win); -} - -Status CpuGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, - int32_t a_offset, int32_t b_offset) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset)); - return Status{}; -} - -void CpuGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto vector_sum_col = 
tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto vector_sum_row = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto mm_result = tensors.get_tensor(TensorType::ACL_DST); - - // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->info()->num_dimensions() > 1 - && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); - - run_offset_contribution(window, mm_result, vector_sum_col, vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, reinterpret_as_3d); -} - -const char *CpuGemmLowpOffsetContributionKernel::name() const -{ - return "CpuGemmLowpOffsetContributionKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h b/src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h deleted file mode 100644 index f23a46cde7..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_KERNEL_H -#define ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel used to add the offset contribution after @ref CpuGemmLowpMatrixMultiplyKernel. The computation is performed in-place - * - * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), - * and adds to it the offset contribution of matrix A and matrix B in-place. - * - * The final result is: - * - * mm_result[i][k] = mm_result[i][k] + - * (vector_sum_col[k] * a_offset) + - * (vector_sum_row[i] * b_offset) + - * (a_offset * b_offset * k) - * - */ -class CpuGemmLowpOffsetContributionKernel : public ICpuKernel -{ -public: - /** Default constructor */ - CpuGemmLowpOffsetContributionKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpOffsetContributionKernel); - /** Initialise the kernel's input and output. - * - * @param[in, out] mm_result Input tensor containing the result of @ref CpuGemmLowpMatrixMultiplyKernel. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. 
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] k Number of matrix A columns or Matrix B rows - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - */ - void configure(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuGemmLowpOffsetContributionKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, int32_t a_offset, int32_t b_offset); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - int32_t _a_offset{ 0 }; - int32_t _b_offset{ 0 }; - int32_t _k_offset{ 0 }; - bool _slide_vector_sum_col{ true }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp b/src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp deleted file mode 100644 index 332ce6f013..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp +++ /dev/null @@ -1,946 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -inline int32x4x4_t load_results_input(const Iterator &mm_result_it, int32_t x) -{ - return - { - { - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + x + 12) - } - }; -} - -inline int32x4x4_t load(const int32_t *ptr, int32_t x) -{ - return - { - { - vld1q_s32(ptr + x + 0), - vld1q_s32(ptr + x + 4), - vld1q_s32(ptr + x + 8), - vld1q_s32(ptr + x + 12) - } - }; -} - -inline int32x4x4_t add_s32(int32x4x4_t a, int32x4_t b) -{ - return - { - { - vaddq_s32(a.val[0], b), - vaddq_s32(a.val[1], b), - vaddq_s32(a.val[2], b), - vaddq_s32(a.val[3], b) - } - }; -} - -inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b) -{ - return - { - { - vaddq_s32(a.val[0], b.val[0]), - vaddq_s32(a.val[1], b.val[1]), - vaddq_s32(a.val[2], b.val[2]), - vaddq_s32(a.val[3], b.val[3]) - } - }; -} - -inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar) -{ - return - { - { - vmulq_n_s32(a.val[0], mul_scalar), - vmulq_n_s32(a.val[1], mul_scalar), - vmulq_n_s32(a.val[2], mul_scalar), - vmulq_n_s32(a.val[3], mul_scalar) - } - }; -} - -inline int32x4x4_t mul_s32(int32x4x4_t &a, const int32_t *multilpier) -{ - return - { - { - vmulq_s32(a.val[0], vld1q_s32(multilpier)), - vmulq_s32(a.val[1], vld1q_s32(multilpier + 4)), - vmulq_s32(a.val[2], vld1q_s32(multilpier + 8)), - vmulq_s32(a.val[3], vld1q_s32(multilpier + 12)) - } - }; -} - -inline int32x4x4_t get_a_offset(const int32_t *vector_sum_col_ptr, int32_t a_offset, int32_t x) -{ - int32x4x4_t a_offset_term_s32 = load(vector_sum_col_ptr, x); - - a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); - a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); - a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); - a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); - return a_offset_term_s32; -} - -inline int32x4_t get_b_offset(const int32_t *vector_sum_row_ptr, int32_t b_offset) -{ - int32x4_t b_offset_term_s32 = vld1q_dup_s32(vector_sum_row_ptr); - b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, b_offset); - return b_offset_term_s32; -} - -inline int32x4x4_t get_k_offset(int32_t k_offset) -{ - return - { - { - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset) - } - }; -} - -inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu) -{ - const static int32x4_t zero_s32 = vdupq_n_s32(0); - - // Shift final result (negative value shift right) - in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32); - in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32); - in_s32.val[2] = 
vshlq_s32(in_s32.val[2], result_shift_s32); - in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32); - - // Saturate negative values - in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32); - in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32); - in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32); - in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); - - // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; - - // Convert S16 to U8 - uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1])); - - if(is_bounded_relu) - { - out_u8 = vmaxq_u8(out_u8, min_u8); - out_u8 = vminq_u8(out_u8, max_u8); - } - - return out_u8; -} - -inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu) -{ - const static int32x4_t zero_s32 = vdupq_n_s32(0); - - // Shift final result (negative value shift right) - in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32); - in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32); - in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32); - in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32); - - // Saturate negative values - in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32); - in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32); - in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32); - in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); - - // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; - - // Convert S16 to S8 - int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - - if(is_bounded_relu) - { - out_s8 = vmaxq_s8(out_s8, min_s8); - out_s8 = vminq_s8(out_s8, max_s8); - } - - return out_s8; -} - -inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu) -{ - const static int32x4_t zero_s32 = vdupq_n_s32(0); - - // Shift final result (negative value shift right) - in_s32.val[0] = vshlq_s32(in_s32.val[0], vnegq_s32(result_shift_s32.val[0])); - in_s32.val[1] = vshlq_s32(in_s32.val[1], vnegq_s32(result_shift_s32.val[1])); - in_s32.val[2] = vshlq_s32(in_s32.val[2], vnegq_s32(result_shift_s32.val[2])); - in_s32.val[3] = vshlq_s32(in_s32.val[3], vnegq_s32(result_shift_s32.val[3])); - - // Saturate negative values - in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32); - in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32); - in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32); - in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); - - // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; - - // Convert S16 to S8 - int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - - if(is_bounded_relu) - { - out_s8 = vmaxq_s8(out_s8, min_s8); - out_s8 = vminq_s8(out_s8, max_s8); - } - - return out_s8; -} - -template -struct VectorTyper -{ - using stype = T; - using vtype = typename wrapper::traits::neon_bitvector_t; -}; - -inline Window get_win_vector_sum(const Window &window) -{ - Window win_vector_sum(window); - 
win_vector_sum.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum.set(Window::DimZ, Window::Dimension(0, 0, 0)); - return win_vector_sum; -} - -inline Iterator get_vector_sum_col_it(const Window &window, const ITensor *vector_sum_col) -{ - Iterator vector_sum_col_it(vector_sum_col, get_win_vector_sum(window)); - return vector_sum_col_it; -} - -inline Iterator get_vector_sum_row_it(const Window &window, const ITensor *vector_sum_row) -{ - Window win_vector_sum_row = get_win_vector_sum(window); - win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); - Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row); - return vector_sum_row_it; -} - -inline Iterator get_bias_it(const Window &window, const ITensor *bias) -{ - Window win_bias(window); - win_bias.set(Window::DimY, Window::Dimension(0, 1, 1)); - win_bias.set(Window::DimZ, Window::Dimension(0, 1, 1)); - Iterator bias_it(bias, win_bias); - return bias_it; -} - -template -inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, const int32_t *vector_sum_row_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it, - const int32x4_t result_offset_s32, const int32x4_t result_shift_s32, - typename VT::vtype min_vec, typename VT::vtype max_vec, - int32_t a_offset, int32_t b_offset, int32_t k_offset, - int32_t multiplier, int32_t shift, int32_t offset, int32_t min_bound, int32_t max_bound, - int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_b_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point) -{ - int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 }; - if(!is_fixed_point) - { - // Combine quantization offset with other offsets. - offset_term_s32 = add_s32(offset_term_s32, result_offset_s32); - } - if(has_a_offset && has_b_offset) - { - offset_term_s32 = add_s32(offset_term_s32, get_k_offset(k_offset)); - } - if(has_b_offset) - { - offset_term_s32 = add_s32(offset_term_s32, get_b_offset(vector_sum_row_ptr, b_offset)); - } - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - int32x4x4_t in_s32 = load_results_input(mm_result_it, x); - - if(has_a_offset) - { - in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x)); - } - if(has_bias) - { - in_s32 = add_s32(in_s32, load(bias_ptr, x)); - } - if(!is_fixed_point || has_b_offset) - { - in_s32 = add_s32(in_s32, offset_term_s32); - } - if(!is_fixed_point) - { - in_s32 = mul_s32(in_s32, multiplier); - } - - if(is_fixed_point) - { - wrapper::vstore(reinterpret_cast(out_it.ptr() + x), - finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec, is_bounded_relu)); - } - else - { - wrapper::vstore(reinterpret_cast(out_it.ptr() + x), - finalize_quantization_floating_point(in_s32, result_shift_s32, min_vec, max_vec, is_bounded_relu)); - } - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t in_value = *(reinterpret_cast(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0); - - if(has_a_offset) - { - in_value += (*(vector_sum_col_ptr + x) * a_offset); - } - if(has_bias) - { - in_value += *(bias_ptr + x); - } - - if(is_fixed_point) - { - // Finalize and store the result - *reinterpret_cast(out_it.ptr() + x) = finalize_quantization(in_value, multiplier, shift, offset, - static_cast(min_bound), - static_cast(max_bound), is_bounded_relu); - } - else - { - // Finalize quantization - in_value = (in_value * multiplier) >> shift; - - // Bound and store the result - 
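// When a bounded ReLU is fused (is_bounded_relu), the value is first clamped to
// [min_bound, max_bound]; it is then saturated to the numeric range of the output type
// before being stored.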
if(is_bounded_relu) - { - in_value = static_cast(std::max(min_bound, std::min(max_bound, in_value))); - } - *reinterpret_cast(out_it.ptr() + x) = static_cast(std::max(static_cast(std::numeric_limits::lowest()), - std::min(static_cast(std::numeric_limits::max()), in_value))); - } - } -} - -inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it, - const int32_t *result_multipliers, const int32_t *result_shifts, - const int32x4_t result_offset, int8x16_t min_s8, int8x16_t max_s8, - int32_t a_offset, int32_t offset, int32_t min_bound, int32_t max_bound, - int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point) -{ - int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 }; - if(!is_fixed_point) - { - // Combine quantization offset with other offsets. - offset_term_s32 = add_s32(offset_term_s32, result_offset); - } - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - int32x4x4_t in_s32 = load_results_input(mm_result_it, x); - - if(has_a_offset) - { - in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x)); - } - if(has_bias) - { - in_s32 = add_s32(in_s32, load(bias_ptr, x)); - } - if(!is_fixed_point) - { - in_s32 = add_s32(in_s32, offset_term_s32); - in_s32 = mul_s32(in_s32, result_multipliers + x); - } - - if(is_fixed_point) - { - vst1q_s8(reinterpret_cast(out_it.ptr() + x), finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x), result_offset, min_s8, max_s8, is_bounded_relu)); - } - else - { - vst1q_s8(reinterpret_cast(out_it.ptr() + x), finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8, is_bounded_relu)); - } - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t in_value = *(reinterpret_cast(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0); - - if(has_a_offset) - { - in_value += (*(vector_sum_col_ptr + x) * a_offset); - } - if(has_bias) - { - in_value += *(bias_ptr + x); - } - - if(is_fixed_point) - { - // Finalize and store the result - *(out_it.ptr() + x) = finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset, static_cast(min_bound), static_cast(max_bound), is_bounded_relu); - } - else - { - // Finalize quantization - in_value = (in_value * result_multipliers[x]) >> (-result_shifts[x]); - - // Bound and store the result - if(is_bounded_relu) - { - in_value = static_cast(std::max(min_bound, std::min(max_bound, in_value))); - } - *(out_it.ptr() + x) = static_cast(std::max(-128, std::min(127, in_value))); - } - } -} - -template -void run_offset_contribution_output_stage(const Window &window, - const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, - int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, - GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point) -{ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - using Typer = VectorTyper; - - const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0; - const int depth_input = is_gemm3d ? 
mm_result->info()->dimension(2) : 1; - - const int32_t multiplier = output_stage.gemmlowp_multiplier; - const int32_t shift = output_stage.gemmlowp_shift; - const int32_t offset = output_stage.gemmlowp_offset; - const int32_t min_bound = output_stage.gemmlowp_min_bound; - const int32_t max_bound = output_stage.gemmlowp_max_bound; - - const int32x4_t result_offset_s32 = vdupq_n_s32(offset); - const int32x4_t result_shift_s32 = vdupq_n_s32(is_fixed_point ? shift : -shift); - const auto min_vec = wrapper::vdup_n(static_cast(min_bound), ExactTagType{}); - const auto max_vec = wrapper::vdup_n(static_cast(max_bound), ExactTagType{}); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Window collapsed_window = win.collapse_if_possible(win, Window::DimZ); - - Iterator mm_result_it(mm_result, win); - Iterator out_it(output, win); - - if((a_offset != 0) && (b_offset != 0)) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col); - ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row); - - Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col); - Iterator vector_sum_row_it = get_vector_sum_row_it(collapsed_window, vector_sum_row); - - const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); - - // Offset in case vector_sum_col is batched - const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; - - if(bias != nullptr) - { - Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) - + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), - mm_result_it, - out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, true, true, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it); - } - else - { - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) - + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, true, false, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it); - } - } - else if((a_offset == 0) && (b_offset != 0)) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row); - - Iterator vector_sum_row_it = get_vector_sum_row_it(collapsed_window, 
vector_sum_row); - - const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); - - if(bias != nullptr) - { - Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) - + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, - out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, true, true, is_bounded_relu, is_fixed_point); - }, - vector_sum_row_it, bias_it, mm_result_it, out_it); - } - else - { - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_row_ptr = reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) - + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, true, false, is_bounded_relu, is_fixed_point); - }, - vector_sum_row_it, mm_result_it, out_it); - } - } - else if((a_offset != 0) && (b_offset == 0)) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col); - - Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col); - - // Offset in case vector_sum_col is batched - const int vector_sum_col_batch_offset = slide_vector_sum_col ? 
vector_sum_col->info()->strides_in_bytes().z() : 0; - - if(bias != nullptr) - { - Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, - out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, false, true, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, bias_it, mm_result_it, out_it); - } - else - { - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - run_offset_contribution_output_stage_window(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, false, false, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, mm_result_it, out_it); - } - } - else - { - if(bias != nullptr) - { - Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates &) - { - run_offset_contribution_output_stage_window(nullptr, nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, false, true, is_bounded_relu, is_fixed_point); - }, - bias_it, mm_result_it, out_it); - } - else - { - execute_window_loop(collapsed_window, [&](const Coordinates &) - { - run_offset_contribution_output_stage_window(nullptr, nullptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, false, false, is_bounded_relu, is_fixed_point); - }, - mm_result_it, out_it); - } - return; - } -} - -void run_offset_contribution_output_stage_symm(const Window &window, - const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, - int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, - GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point) -{ - ARM_COMPUTE_UNUSED(vector_sum_row, b_offset, k_offset); - - const int depth_input = is_gemm3d ? 
mm_result->info()->dimension(2) : 1; - - const int32_t offset = output_stage.gemmlowp_offset; - const int32_t min_bound = output_stage.gemmlowp_min_bound; - const int32_t max_bound = output_stage.gemmlowp_max_bound; - - const int32_t *result_multipliers = output_stage.gemmlowp_multipliers.data(); - const int32_t *result_shifts = output_stage.gemmlowp_shifts.data(); - const int32x4_t result_offset_s32 = vdupq_n_s32(offset); - const int8x16_t min_s8 = vdupq_n_s8(static_cast(min_bound)); - const int8x16_t max_s8 = vdupq_n_s8(static_cast(max_bound)); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Window collapsed_window = win.collapse_if_possible(win, Window::DimZ); - - Iterator mm_result_it(mm_result, win); - Iterator out_it(output, win); - - if(a_offset != 0) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col); - - Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col); - - // Offset in case vector_sum_col is batched - const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; - - if(bias != nullptr) - { - Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, true, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, bias_it, mm_result_it, out_it); - } - else - { - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, nullptr, mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, false, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, mm_result_it, out_it); - } - } - else - { - if(bias != nullptr) - { - Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates &) - { - run_offset_contribution_output_stage_window_symm(nullptr, reinterpret_cast(bias_it.ptr()), mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, true, is_bounded_relu, is_fixed_point); - }, - bias_it, mm_result_it, out_it); - } - else - { - execute_window_loop(collapsed_window, [&](const Coordinates &) - { - run_offset_contribution_output_stage_window_symm(nullptr, nullptr, mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, false, is_bounded_relu, is_fixed_point); - }, - mm_result_it, out_it); - } - 
return; - } -} - -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, - int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - if(output->data_type() != DataType::QASYMM8) - { - ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) > 1 && output_stage.gemmlowp_multipliers.size() > 1 && b_offset != 0); - } - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN && output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0)); - } - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); - } - - // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - - // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); - - // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); - ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); - - TensorShape output_shape = output->tensor_shape(); - if(output_shape.num_dimensions() > 1) - { - const unsigned int output_batch_idx = reinterpret_as_3d ? 
3 : 2; - - TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); - vector_sum_row_shape.collapse_from(1); - output_shape.collapse_from(output_batch_idx); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], - "mm_result tensor must have the same number of batches of output tensor"); - - if(a_offset != 0) - { - TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); - vector_sum_col_shape.collapse_from(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); - } - } - } - - if(output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output); - } - - return Status{}; -} -} // namespace - -void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, - const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, - int32_t k, int32_t a_offset, int32_t b_offset, - GEMMLowpOutputStageInfo output_stage) -{ - ARM_COMPUTE_UNUSED(vector_sum_row, bias); - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage)); - - _a_offset = a_offset; - _b_offset = b_offset; - _k_offset = a_offset * b_offset * k; - _output_stage = output_stage; - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - // Check if vector_sum_col_shape should be slidden or not - // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - _slide_vector_sum_col = vector_sum_col->tensor_shape().num_dimensions() > 1; - } - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, mm_result->clone()->set_data_type(DataType::QASYMM8)); - - // Configure kernel window - Window win = calculate_max_window(*mm_result, Steps()); - - // Note: This kernel performs 16 elements per iteration. 
- // However, since we use a left-over for loop, we cannot have any read or write out of memory - // For this reason num_elems_processed_per_iteration is 1 and so update_window_and_padding() can be skipped - ICpuKernel::configure(win); -} - -Status CpuGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, - const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, - int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage)); - return Status{}; -} - -void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto mm_result = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto vector_sum_col = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto vector_sum_row = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_3); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - PixelValue type_min{}; - PixelValue type_max{}; - std::tie(type_min, type_max) = get_min_max(dst->info()->data_type()); - int32_t type_min_int = type_min.get(); - int32_t type_max_int = type_max.get(); - - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->info()->num_dimensions() > 1 - && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); - - const bool is_bounded_relu = !(_output_stage.gemmlowp_min_bound <= type_min_int && _output_stage.gemmlowp_max_bound >= type_max_int); - - // Check if we need to perform fixed point requantization - const bool is_fixed_point = _output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN; - - // Check if symmetric per-channel execution - const bool is_signed = dst->info()->data_type() == DataType::QASYMM8_SIGNED; - - // Check if symmetric per-channel execution - const bool is_symm = _output_stage.is_quantized_per_channel; - - if(is_symm) - { - run_offset_contribution_output_stage_symm(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage, - reinterpret_as_3d, is_bounded_relu, is_fixed_point); - } - else - { - if(is_signed) - { - run_offset_contribution_output_stage(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage, - reinterpret_as_3d, is_bounded_relu, is_fixed_point); - } - else - { - run_offset_contribution_output_stage(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage, - reinterpret_as_3d, is_bounded_relu, is_fixed_point); - } - } -} - -const char *CpuGemmLowpOffsetContributionOutputStageKernel::name() const -{ - return "CpuGemmLowpOffsetContributionOutputStageKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h b/src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h deleted file mode 100644 index 404f2c9496..0000000000 --- 
a/src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_OUTPUTSTAGE_KERNEL_H -#define ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_OUTPUTSTAGE_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel used to add the offset contribution and perform the output stage after @ref CpuGemmLowpMatrixMultiplyKernel. - * - * The computation is performed in-place - * - * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), - * and adds to it the offset contribution of matrix A and matrix B in-place. - * - * The output stage can perform either QuantizeDownInt32ToUint8Scale or QuantizeDownInt32ToUint8ScaleByFixedPoint for Uint8. - * The output stage can perform either QuantizeDownInt32ToInt8Scale or QuantizeDownInt32ToInt8ScaleByFixedPoint for Int8. - * - * For QuantizeDownInt32ToUint8Scale/QuantizeDownInt32ToInt8Scale the final result is: - * - * ((mm_result'[i][k] + result_offset) * result_mult_int) >> result_shift - * - * For QuantizeDownInt32ToUint8ScaleByFixedPoint/QuantizeDownInt32ToInt8ScaleByFixedPoint the final result is: - * - * (FixedPointMul(mm_result'[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift - * - * where FixedPointMul(x, y) is the nearest integer to the following - * mathematical expression, evaluated without overflow or intermediate rounding: - * - * (x * y) / 2^31 - * - * and mm_result'[i][k] = mm_result[i][k] + - * (vector_sum_col[k] * a_offset) + - * (vector_sum_row[i] * b_offset) + - * (a_offset * b_offset * k) - */ - -class CpuGemmLowpOffsetContributionOutputStageKernel : public ICpuKernel -{ -public: - /** Default constructor */ - CpuGemmLowpOffsetContributionOutputStageKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpOffsetContributionOutputStageKernel); - /** Initialise the kernel inputs and output. - * - * @param[in] mm_result Input tensor info containing the result of @ref CpuGemmLowpMatrixMultiplyKernel. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector tensor info of sums of all the entries in each column of matrix B. 
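(Editorial aside, not part of the patch hunk above.) The removed header documents the offset-contribution and output-stage math in prose; a minimal scalar sketch of those same formulas may help when reading the NEON implementation earlier in this file. The helper names below are illustrative only, and the real kernel uses saturating, rounding intrinsics (e.g. vqrdmulhq-style multiplies and rounding shifts) rather than this plain C++:

    #include <algorithm>
    #include <cstdint>

    // mm_result'[i][k] = mm_result[i][k] + a_offset * vector_sum_col[k]
    //                  + b_offset * vector_sum_row[i] + a_offset * b_offset * k
    inline int32_t offset_contribution(int32_t mm_result, int32_t sum_col_k, int32_t sum_row_i,
                                       int32_t a_offset, int32_t b_offset, int32_t k)
    {
        return mm_result + a_offset * sum_col_k + b_offset * sum_row_i + a_offset * b_offset * k;
    }

    // QuantizeDownInt32ToUint8Scale: ((acc + result_offset) * result_mult_int) >> result_shift
    inline uint8_t quantize_down_scale(int32_t acc, int32_t result_offset,
                                       int32_t result_mult_int, int32_t result_shift)
    {
        const int32_t v = ((acc + result_offset) * result_mult_int) >> result_shift;
        return static_cast<uint8_t>(std::max(0, std::min(255, v)));
    }

    // QuantizeDownInt32ToUint8ScaleByFixedPoint: FixedPointMul(acc, m) is approximately
    // round(acc * m / 2^31); the result is then shifted and offset before clamping.
    inline uint8_t quantize_down_fixedpoint(int32_t acc, int32_t multiplier,
                                            int32_t result_shift, int32_t offset_after_shift)
    {
        const int64_t prod = static_cast<int64_t>(acc) * static_cast<int64_t>(multiplier);
        const int32_t fp   = static_cast<int32_t>((prod + (INT64_C(1) << 30)) >> 31);
        const int32_t v    = (fp >> result_shift) + offset_after_shift;
        return static_cast<uint8_t>(std::max(0, std::min(255, v)));
    }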
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector tensor info of sums of all the entries in each row of matrix A. - * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result. - * @param[out] dst Output tensor info containing the final quantized result. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] k Number of matrix A columns or Matrix B rows - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] output_stage GEMMLowp output stage info, providing the type of quantization and the necessary parameters. - */ - void configure(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, int32_t k, int32_t a_offset, - int32_t b_offset, - GEMMLowpOutputStageInfo output_stage); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuGemmLowpOffsetContributionOutputStageKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, int32_t a_offset, - int32_t b_offset, - GEMMLowpOutputStageInfo output_stage); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - /** Function to use for the particular tensors passed to configure() */ - int32_t _a_offset{ 0 }; - int32_t _b_offset{ 0 }; - int32_t _k_offset{ 0 }; - bool _slide_vector_sum_col{ true }; - GEMMLowpOutputStageInfo _output_stage{ GEMMLowpOutputStageInfo() }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_OUTPUTSTAGE_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp deleted file mode 100644 index f1c797244a..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp +++ /dev/null @@ -1,326 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); - - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) - || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound); - - // Check biases if exist - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); - } - - if(dst->total_size() != 0) - { - if(dst->data_type() != output_stage->output_data_type && (output_stage->output_data_type == DataType::QASYMM8 || output_stage->output_data_type == DataType::QASYMM8_SIGNED)) - { - ARM_COMPUTE_RETURN_ERROR_MSG("Mismatching data types"); - } - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - - return Status{}; -} - -inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_t result_mult_int) -{ - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_s32); - in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_s32); - in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_s32); - in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_s32); - - // Multiply by result_mult_int - in_s32.val[0] = vmulq_n_s32(in_s32.val[0], result_mult_int); - in_s32.val[1] = vmulq_n_s32(in_s32.val[1], result_mult_int); - in_s32.val[2] = vmulq_n_s32(in_s32.val[2], result_mult_int); - in_s32.val[3] = vmulq_n_s32(in_s32.val[3], result_mult_int); -} - -template -inline typename std::enable_if::value, - typename wrapper::traits::neon_vector::type>::type - convert_to_8bit(const int16x8x2_t in_s16) -{ - return wrapper::vcombine(wrapper::vqmovun(in_s16.val[0]), wrapper::vqmovun(in_s16.val[1])); -} - -template -inline typename std::enable_if::value, - typename wrapper::traits::neon_vector::type>::type - convert_to_8bit(const int16x8x2_t in_s16) -{ - return wrapper::vcombine(wrapper::vqmovn(in_s16.val[0]), wrapper::vqmovn(in_s16.val[1])); -} - -template -inline typename wrapper::traits::neon_vector::type finalize_quantization(int32x4x4_t 
&in_s32, int32x4_t result_shift_s32, typename wrapper::traits::neon_vector::type min, - typename wrapper::traits::neon_vector::type max) -{ - // Shift final result (negative value shift right) - in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32); - in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32); - in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32); - in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32); - - // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; - - // Convert S16 to S8 or U8 - typename wrapper::traits::neon_vector::type out = convert_to_8bit(in_s16); - - out = wrapper::vmax(out, min); - out = wrapper::vmin(out, max); - - return out; -} -} // namespace - -template -void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window) -{ - using VectorType = typename wrapper::traits::neon_vector::type; - - const int32x4_t result_offset_s32 = vdupq_n_s32(_output_stage->gemmlowp_offset); - const int32x4_t result_shift_s32 = vdupq_n_s32(-_output_stage->gemmlowp_shift); - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - const int clamp_min = (_is_bounded_relu) ? _output_stage->gemmlowp_min_bound : std::numeric_limits::lowest(); - const int clamp_max = (_is_bounded_relu) ? _output_stage->gemmlowp_max_bound : std::numeric_limits::max(); - - VectorType min = wrapper::vdup_n(static_cast(clamp_min), wrapper::traits::vector_128_tag{}); - VectorType max = wrapper::vdup_n(static_cast(clamp_max), wrapper::traits::vector_128_tag{}); - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator out(dst, win); - - if(bias != nullptr) - { - Window win_biases; - win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Iterator bias_i(bias, win_biases); - execute_window_loop(win, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - int32x4x4_t in_s32 = - { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) - } - }; - - const int32x4x4_t bias_s32 = - { - { - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 12) - } - }; - - // Add the bias to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); - - // Add the offset terms to GEMM's result and multiply by result_mult_int - scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier); - - wrapper::vstore(reinterpret_cast(out.ptr() + x), finalize_quantization(in_s32, result_shift_s32, min, max)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int bias_value = *(reinterpret_cast(bias_i.ptr()) + x); - int in_value = 
*(reinterpret_cast(in.ptr()) + x); - - // Quantize - in_value = ((in_value + bias_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> _output_stage->gemmlowp_shift; - - // Store the result - *(out.ptr() + x) = static_cast(utility::clamp(in_value, clamp_min, clamp_max)); - } - }, - in, bias_i, out); - } - else - { - execute_window_loop(win, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - int32x4x4_t in_s32 = - { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) - } - }; - - // Add the offset terms to GEMM's result and multiply by result_mult_int - scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier); - - wrapper::vstore(reinterpret_cast(out.ptr() + x), finalize_quantization(in_s32, result_shift_s32, min, max)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int in_value = *(reinterpret_cast(in.ptr()) + x); - - // Quantize - in_value = ((in_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> _output_stage->gemmlowp_shift; - - // Store the result - *(out.ptr() + x) = static_cast(utility::clamp(in_value, clamp_min, clamp_max)); - } - }, - in, out); - } -} - -void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) -{ - ARM_COMPUTE_UNUSED(bias); - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, output_stage); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type)); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, - bias, - dst, - output_stage)); - - _output_stage = output_stage; - - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - - ICpuKernel::configure(win); - - // Check if we need to clamp the result using min and max - _is_bounded_relu = ((_output_stage->gemmlowp_min_bound != _output_stage->gemmlowp_max_bound) - && !(_output_stage->gemmlowp_min_bound == std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) - && _output_stage->gemmlowp_max_bound == std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)))); - if(_output_stage->output_data_type == DataType::QASYMM8) - { - _func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal; - } - else if(_output_stage->output_data_type == DataType::QASYMM8_SIGNED) - { - _func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal; - } - else - { - ARM_COMPUTE_ERROR("Data type not supported"); - } -} - -Status CpuGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage)); - return Status{}; -} - -void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - - auto src = 
tensors.get_const_tensor(TensorType::ACL_SRC); - auto bias = tensors.get_const_tensor(TensorType::ACL_BIAS); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - (this->*_func)(src, bias, dst, window); -} - -const char *CpuGemmLowpQuantizeDownInt32ScaleKernel::name() const -{ - return "CpuGemmLowpQuantizeDownInt32ScaleKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h deleted file mode 100644 index ca5e1b40fc..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H -#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -// Forward declarations -class ITensor; -namespace cpu -{ -namespace kernels -{ -/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED - * - * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. - * The following computations will be performed by the kernel: - * - * -# Add offset terms to final result - * -# Multiply each entry of result by result_mult_int - * -# Add bias to final result if bias tensor is not a nullptr - * -# Shift the int32 accumulator by result_shift - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values: - * -# -to the [0..255] range and cast to QASYMM8. - * -# -to the [-128..127] range and cast to QASYMM8_SIGNED. - * - */ -class CpuGemmLowpQuantizeDownInt32ScaleKernel : public ICpuKernel -{ -public: - CpuGemmLowpQuantizeDownInt32ScaleKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpQuantizeDownInt32ScaleKernel); - /** Initialise the kernel's input and output. - * - * @param[in] src Input tensor info. Data type supported: S32 - * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required. 
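(Editorial aside, not part of the patch hunk above.) The computation list documented for CpuGemmLowpQuantizeDownInt32ScaleKernel maps directly onto the scalar left-over loop shown earlier in the deleted .cpp. A hedged, standalone sketch of that per-element path, using illustrative names only, is:

    #include <algorithm>
    #include <cstdint>
    #include <limits>

    // T is uint8_t for QASYMM8 output and int8_t for QASYMM8_SIGNED output.
    template <typename T>
    T quantize_down_int32_scale(int32_t acc, int32_t bias, int32_t offset, int32_t multiplier,
                                int32_t shift, int32_t clamp_min, int32_t clamp_max)
    {
        int32_t v = ((acc + bias + offset) * multiplier) >> shift;   // add offsets, scale, shift
        v = std::max(clamp_min, std::min(clamp_max, v));             // optional bounded ReLU
        v = std::max<int32_t>(std::numeric_limits<T>::lowest(),      // clamp to the output type range
                              std::min<int32_t>(std::numeric_limits<T>::max(), v));
        return static_cast<T>(v);
    }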
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] dst Output tensor info. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[out] output_stage GEMMLowp output stage metadata. - */ - void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuGemmLowpQuantizeDownInt32ScaleKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - /** Template function to run the NEGEMMLowpQuantizeDownInt32ScaleKernel - * - * @param[in] src Input tensor info - * @param[in] bias Biases tensor info - * @param[out] dst Output tensor info - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()) - */ - template - void run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); - - /** Common signature for all the specialised CpuGemmLowpQuantizeDownInt32ScaleKernel functions - * - * @param[in] src Input tensor info - * @param[in] bias Biases tensor info - * @param[out] dst Output tensor info - * @param[in] window Region on which to execute the kernel. - */ - using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ScaleKernel::*)(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); - - QuantizeDownFunctionPtr _func{ nullptr }; - const GEMMLowpOutputStageInfo *_output_stage{ nullptr }; - bool _is_bounded_relu{ false }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp deleted file mode 100644 index 390e269cbb..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp +++ /dev/null @@ -1,227 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/NESymm.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(min > max); - - // Check biases if exist - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); - } - - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM16); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src); - } - - return Status{}; -} -} // namespace - -template -void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window) -{ - const int16x8_t min_s16 = vdupq_n_s16(static_cast(_min)); - const int16x8_t max_s16 = vdupq_n_s16(static_cast(_max)); - - ARM_COMPUTE_UNUSED(min_s16); - ARM_COMPUTE_UNUSED(max_s16); - - const int window_step_x = 8; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win_collapsed); - Iterator out(dst, win_collapsed); - if(bias != nullptr) - { - Window win_biases; - win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Iterator bias_i(bias, win_biases); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - int32x4x2_t in_s32 = - { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4) - } - }; - - const int32x4x2_t bias_s32 = - { - { - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 4) - } - }; - - // Add the bias to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); - - vst1q_s16(reinterpret_cast(out.ptr()) + x, finalize_quantization_int16(in_s32, _result_fixedpoint_multiplier, _result_shift, min_s16, max_s16)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t 
bias_value = *(reinterpret_cast(bias_i.ptr()) + x); - int32_t in_value = *(reinterpret_cast(in.ptr()) + x); - - // Add bias - in_value += bias_value; - // Finalize and store the result - *(reinterpret_cast(out.ptr()) + x) = finalize_quantization_int16(in_value, _result_fixedpoint_multiplier, _result_shift, static_cast(_min), - static_cast(_max)); - } - }, - in, out, bias_i); - } - else - { - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - int32x4x2_t in_s32 = - { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4) - } - }; - - vst1q_s16(reinterpret_cast(out.ptr()) + x, finalize_quantization_int16(in_s32, _result_fixedpoint_multiplier, _result_shift, min_s16, max_s16)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t in_value = *(reinterpret_cast(in.ptr()) + x); - ARM_COMPUTE_UNUSED(in_value); - // Finalize and store the result - *(reinterpret_cast(out.ptr()) + x) = finalize_quantization_int16(in_value, _result_fixedpoint_multiplier, _result_shift, static_cast(_min), - static_cast(_max)); - } - }, - in, out); - } -} - -void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, - int min, int max) -{ - // Perform validate step - ARM_COMPUTE_UNUSED(bias, dst); - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, min, max)); - - _result_fixedpoint_multiplier = result_fixedpoint_multiplier; - _result_shift = result_shift; - _min = min; - _max = max; - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*src, src->clone()->set_data_type(DataType::QSYMM16)); - // Configure kernel window - Window win_config = calculate_max_window(*src, Steps()); - ICpuKernel::configure(win_config); - - // Check if we need to clamp the result using min and max - const bool is_bounded_relu = !(min <= -32768 && max >= 32767); - _func = is_bounded_relu ? 
&CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal : - &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal; -} - -Status CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max)); - return Status{}; -} - -void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - - auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto bias = tensors.get_const_tensor(TensorType::ACL_BIAS); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - (this->*_func)(src, bias, dst, window); -} - -const char *CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::name() const -{ - return "CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h deleted file mode 100644 index e360e65bae..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT16_SCALEBYFIXEDPOINT_KERNEL_H -#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT16_SCALEBYFIXEDPOINT_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -// Forward declaration -class ITensor; -namespace cpu -{ -namespace kernels -{ -/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16 - * - * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QSYMM16 value. 
- * The following computations will be performed by the kernel: - * - * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier - * -# Add bias to final result if bias tensor is not a nullptr - * -# Round to nearest division by a power-of-two using result_shift - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values to the [-32768, 32767] range and cast to QSYMM16. - * - */ -class CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel : public ICpuKernel -{ -public: - CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel); - /** Initialise the kernel's input and output. - * - * @param[in] src Input tensor info. Data type supported: S32 - * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] dst Output tensor info. Data type supported: Data type supported: QSYMM16 - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16. - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0. - */ - void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int min = 0, int max = 0); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - /** Template function to run the CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel - * - * @param[in] src Input tensor info - * @param[in] bias Bias tensor info - * @param[out] dst Output tensor info - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - */ - template - void run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); - - /** Common signature for all the specialised CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel functions - * - * @param[in] src Input tensor info - * @param[in] bias Bias tensor info - * @param[out] dst Output tensor info - * @param[in] window Region on which to execute the kernel. 
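// The QuantizeDownFunctionPtr alias declared just below, together with the
// `template <bool is_bounded_relu>` run_internal() member documented above, forms a small
// dispatch: configure() evaluates the clamp bounds once and stores a pointer to member,
// presumably the <true> or <false> specialization (the angle-bracket template arguments
// appear to have been stripped from this copy of the diff). A self-contained sketch of the
// same pattern, with hypothetical names:
#include <cstdio>

class BoundedReluDispatchSketch
{
public:
    void configure(int min, int max)
    {
        const bool is_bounded_relu = !(min <= -32768 && max >= 32767);
        _func = is_bounded_relu ? &BoundedReluDispatchSketch::work<true>
                                : &BoundedReluDispatchSketch::work<false>;
    }
    void run()
    {
        (this->*_func)(); // indirect call through the stored pointer-to-member
    }

private:
    template <bool IsBoundedRelu>
    void work()
    {
        // The flag is a compile-time constant inside each specialization, so the clamp
        // can be compiled out of the unbounded variant.
        std::printf("bounded relu: %d\n", static_cast<int>(IsBoundedRelu));
    }

    using FuncPtr = void (BoundedReluDispatchSketch::*)();
    FuncPtr _func{ nullptr };
};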
- */ - using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::*)( - const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); - - QuantizeDownFunctionPtr _func{ nullptr }; - int _result_fixedpoint_multiplier{ 0 }; - int _result_shift{ 0 }; - int _min{ 0 }; - int _max{ 0 }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT16_SCALEBYFIXEDPOINT_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp deleted file mode 100644 index 318b6a06f8..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(min > max); - - // Check biases if exist - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); - } - - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src); - } - - return Status{}; -} -} // namespace - -template -void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window) -{ - const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift); - const int8x16_t min_s8 = vdupq_n_s8(static_cast(_min)); - const int8x16_t max_s8 = vdupq_n_s8(static_cast(_max)); - - ARM_COMPUTE_UNUSED(min_s8, max_s8); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win_collapsed); - Iterator out(dst, win_collapsed); - if(bias != nullptr) - { - Window win_biases; - win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Iterator bias_i(bias, win_biases); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - int32x4x4_t in_s32 = - { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) - } - }; - - const int32x4x4_t bias_s32 = - { - { - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 12) - } - }; - - // Add the bias to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); - - vst1q_s8(reinterpret_cast(out.ptr() + x), - finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, 
result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t bias_value = *(reinterpret_cast(bias_i.ptr()) + x); - int32_t in_value = *(reinterpret_cast(in.ptr()) + x); - - // Add bias - in_value += bias_value; - // Finalize and store the result - *reinterpret_cast(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, - static_cast(_min), static_cast(_max), is_bounded_relu); - } - }, - in, out, bias_i); - } - else - { - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - int32x4x4_t in_s32 = - { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) - } - }; - - vst1q_s8(reinterpret_cast(out.ptr() + x), - finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t in_value = *(reinterpret_cast(in.ptr()) + x); - - // Finalize and store the result - *reinterpret_cast(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, - static_cast(_min), static_cast(_max), is_bounded_relu); - } - }, - in, out); - } -} - -void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, - int result_offset_after_shift, int min, int max) -{ - ARM_COMPUTE_UNUSED(bias); - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, min, max)); - - _result_fixedpoint_multiplier = result_fixedpoint_multiplier; - _result_shift = result_shift; - _result_offset_after_shift = result_offset_after_shift; - _min = min; - _max = max; - - // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_data_type(DataType::QASYMM8_SIGNED)); - - // Configure kernel window - Window win_config = calculate_max_window(*src, Steps()); - ICpuKernel::configure(win_config); - - // Check if we need to clamp the result using min and max - const bool is_bounded_relu = !(min <= -128 && max >= 127); - _func = is_bounded_relu ? 
&CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal : - &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal; -} - -Status CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, min, max)); - return Status{}; -} - -void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - - auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto bias = tensors.get_const_tensor(TensorType::ACL_BIAS); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - (this->*_func)(src, bias, dst, window); -} - -const char *CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::name() const -{ - return "CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h deleted file mode 100644 index 9c213abdf7..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT8_SCALEBYFIXEDPOINT_KERNEL_H -#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT8_SCALEBYFIXEDPOINT_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -// Forward declaration -class ITensor; -namespace cpu -{ -namespace kernels -{ -/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8_SIGNED - * - * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8_SIGNED value. 
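// Compared with the QSYMM16 kernel earlier in this patch, the scalar view of this kernel
// (see the list just below) adds one step: result_offset_after_shift is added after the
// rounding shift and before the clamp, and the final saturation targets the signed 8-bit
// range [-128, 127] instead of [-32768, 32767].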
- * The following computations will be performed by the kernel: - * - * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier - * -# Add bias to final result if bias tensor is not a nullptr - * -# Round to nearest division by a power-of-two using result_shift - * -# Add offset to each result - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values to the [-128..127] range and cast to QASYMM8_SIGNED. - * - */ -class CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel : public ICpuKernel -{ -public: - CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel); - /** Initialise the kernel's input and output. - * - * @param[in] src Input tensor info. Data type supported: S32 - * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] dst Output tensor info. Data type supported: Data type supported: QASYMM8_SIGNED - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication - * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED, - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions - */ - void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - /** Template function to run the CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel - * - * @param[in] src Input tensor info - * @param[in] bias Bias tensor info - * @param[out] dst Output tensor info - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - */ - template - void run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); - - /** Common signature for all the specialised CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel functions - * - * @param[in] src Input tensor info - * @param[in] bias Bias tensor info - * @param[out] dst Output tensor info - * @param[in] window Region on which to execute the kernel. 
- */ - using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::*)( - const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); - - QuantizeDownFunctionPtr _func{ nullptr }; - int _result_fixedpoint_multiplier{ 0 }; - int _result_shift{ 0 }; - int _result_offset_after_shift{ 0 }; - int _min{ 0 }; - int _max{ 0 }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT8_SCALEBYFIXEDPOINT_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp deleted file mode 100644 index 6631a4fc67..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(min > max); - - // Check biases if exist - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); - } - - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src); - } - - return Status{}; -} -} // namespace - -template -void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window) -{ - const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift); - const uint8x16_t min_u8 = vdupq_n_u8(static_cast(_min)); - const uint8x16_t max_u8 = vdupq_n_u8(static_cast(_max)); - - ARM_COMPUTE_UNUSED(min_u8); - ARM_COMPUTE_UNUSED(max_u8); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win_collapsed); - Iterator out(dst, win_collapsed); - if(bias != nullptr) - { - Window win_biases; - win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Iterator bias_i(bias, win_biases); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - int32x4x4_t in_s32 = - { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) - } - }; - - const int32x4x4_t bias_s32 = - { - { - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(bias_i.ptr()) + x + 12) - } - }; - - // Add the bias to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); - - vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, 
result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t bias_value = *(reinterpret_cast(bias_i.ptr()) + x); - int32_t in_value = *(reinterpret_cast(in.ptr()) + x); - - // Add bias - in_value += bias_value; - // Finalize and store the result - *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast(_min), static_cast(_max), is_bounded_relu); - } - }, - in, out, bias_i); - } - else - { - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - int32x4x4_t in_s32 = - { - { - vld1q_s32(reinterpret_cast(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast(in.ptr()) + x + 12) - } - }; - - vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t in_value = *(reinterpret_cast(in.ptr()) + x); - - // Finalize and store the result - *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast(_min), static_cast(_max), is_bounded_relu); - } - }, - in, out); - } -} - -void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, - int result_offset_after_shift, int min, int max) -{ - ARM_COMPUTE_UNUSED(bias); - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, min, max)); - - _result_fixedpoint_multiplier = result_fixedpoint_multiplier; - _result_shift = result_shift; - _result_offset_after_shift = result_offset_after_shift; - _min = min; - _max = max; - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_data_type(DataType::QASYMM8)); - - // Configure kernel window - auto win_config = calculate_max_window(*src, Steps()); - ICpuKernel::configure(win_config); - - // Check if we need to clamp the result using min and max - const bool is_bounded_relu = !(min <= 0 && max >= 255); - _func = is_bounded_relu ? 
&CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal : - &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal; -} - -Status CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, min, max)); - return Status{}; -} - -void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - - auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto bias = tensors.get_const_tensor(TensorType::ACL_BIAS); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - (this->*_func)(src, bias, dst, window); -} - -const char *CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::name() const -{ - return "CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h deleted file mode 100644 index 13b30f3427..0000000000 --- a/src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOUINT8_SCALEBYFIXEDPOINT_KERNEL_H -#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOUINT8_SCALEBYFIXEDPOINT_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -// Forward declaration -class ITensor; -namespace cpu -{ -namespace kernels -{ -/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 - * - * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value. 
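// Note on the fixed-point parameters documented further down: result_fixedpoint_multiplier
// and result_shift together encode a real-valued rescale factor (typically the ratio of the
// accumulator scale to the QASYMM8 output scale) as a Q0.31 multiplier plus a right shift.
// A generic sketch of that encoding for a factor in (0, 1); the library provides its own
// helpers for this, so the name and corner-case handling here are illustrative only:
#include <cassert>
#include <cmath>
#include <cstdint>

inline void encode_rescale_sketch(float rescale, int32_t &multiplier, int &right_shift)
{
    assert(rescale > 0.f && rescale < 1.f);
    int exponent = 0;
    const float mantissa = std::frexp(rescale, &exponent); // rescale = mantissa * 2^exponent, mantissa in [0.5, 1)
    right_shift = -exponent;                                // positive right shift for factors below 1
    int64_t q = std::llround(static_cast<double>(mantissa) * static_cast<double>(int64_t{1} << 31));
    if(q == (int64_t{1} << 31))                             // mantissa rounded up to exactly 1.0
    {
        q /= 2;
        --right_shift;
    }
    multiplier = static_cast<int32_t>(q);                   // Q0.31 fixed-point multiplier
}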
- * The following computations will be performed by the kernel: - * - * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier - * -# Add bias to final result if bias tensor is not a nullptr - * -# Round to nearest division by a power-of-two using result_shift - * -# Add offset to each result - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. - * - */ -class CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public ICpuKernel -{ -public: - CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel); - /** Initialise the kernel's input and output. - * - * @param[in] src Input tensor info. Data type supported: S32 - * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] dst Output tensor info. Data type supported: Data type supported: QASYMM8 - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication - * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8 - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions - */ - void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - /** Template function to run the CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel - * - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - */ - template - void run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); - - /** Common signature for all the specialised CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel functions - * - * @param[in] window Region on which to execute the kernel. 
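// For reference, a hypothetical snippet (placeholder values, not taken from the library's
// tests or docs) showing how the configure()/validate() pair documented above is typically
// driven; at run time the kernel is then fed an ITensorPack holding ACL_SRC / ACL_BIAS /
// ACL_DST, as run_op() expects.
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"

void configure_quantize_down_sketch()
{
    using namespace arm_compute;

    TensorInfo src_info(TensorShape(17U, 4U), 1, DataType::S32);     // GEMMLowp accumulators
    TensorInfo dst_info(TensorShape(17U, 4U), 1, DataType::QASYMM8); // requantized output

    cpu::kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel k;
    ARM_COMPUTE_ERROR_THROW_ON(
        cpu::kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(&src_info, nullptr, &dst_info, 0, 255));
    k.configure(&src_info, nullptr, &dst_info,
                1073741824 /* placeholder multiplier */, 5 /* placeholder shift */,
                10 /* placeholder offset */, 0, 255);
}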
- */ - using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::*)( - const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); - - QuantizeDownFunctionPtr _func{ nullptr }; - int _result_fixedpoint_multiplier{ 0 }; - int _result_shift{ 0 }; - int _result_offset_after_shift{ 0 }; - int _min{ 0 }; - int _max{ 0 }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOUINT8_SCALEBYFIXEDPOINT_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp b/src/core/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp deleted file mode 100644 index da0f7b135e..0000000000 --- a/src/core/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuGemmMatrixAdditionKernel.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -void matrix_addition_f32(const ITensor *src, ITensor *dst, const Window &window, float beta) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - const float32x4_t beta_f32 = vdupq_n_f32(beta); - - constexpr int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win = window.collapse_if_possible(window, Window::DimZ); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator out(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x < (window_end_x - window_step_x); x += window_step_x) - { - float32x4x4_t alpha_ab = vld4q_f32(out_ptr + x); - const float32x4x4_t c = vld4q_f32(in_ptr + x); - - // Multiply matrix C by its weight and accumulate - alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32); - alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32); - alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32); - alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32); - - vst4q_f32(out_ptr + x, alpha_ab); - } - - // Left-over loop - for(; x < window_end_x; ++x) - { - *(out_ptr + x) += *(in_ptr + x) * beta; - } - }, - in, out); -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void matrix_addition_f16(const ITensor *src, ITensor *dst, const Window &window, float beta) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - const float16x8_t beta_f16 = vdupq_n_f16(beta); - - constexpr int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win = window.collapse_if_possible(window, Window::DimZ); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator out(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - int x = window_start_x; - for(; x < (window_end_x - window_step_x); x += window_step_x) - { - float16x8x2_t alpha_ab = vld2q_f16(out_ptr + x); - const float16x8x2_t c = vld2q_f16(in_ptr + x); - // Multiply matrix C by its weight and accumulate - alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16)); - alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16)); - - vst2q_f16(out_ptr + x, alpha_ab); - } - - // Left-over loop - for(; x < window_end_x; ++x) - { - *(out_ptr + x) += *(in_ptr + x) * static_cast(beta); - } - }, - in, out); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -} // namespace - -void CpuGemmMatrixAdditionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, float beta) -{ - ARM_COMPUTE_UNUSED(dst); - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(CpuGemmMatrixAdditionKernel::validate(src, dst, beta)); - - _beta = beta; - switch(src->data_type()) - { - case 
DataType::F32: - _func = &matrix_addition_f32; - break; - case DataType::F16: -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - _func = &matrix_addition_f16; - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - ICPPKernel::configure(win); -} - -Status CpuGemmMatrixAdditionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_UNUSED(beta); - - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); - - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - return Status{}; -} - -void CpuGemmMatrixAdditionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - ARM_COMPUTE_ERROR_ON(tensors.empty()); - - const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); - ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - - if(_beta != 0.0f) - { - (*_func)(src, dst, window, _beta); - } -} - -const char *CpuGemmMatrixAdditionKernel::name() const -{ - return "CpuGemmMatrixAdditionKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuGemmMatrixAdditionKernel.h b/src/core/cpu/kernels/CpuGemmMatrixAdditionKernel.h deleted file mode 100644 index f9450b962b..0000000000 --- a/src/core/cpu/kernels/CpuGemmMatrixAdditionKernel.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CPU_GEMM_MATRIX_ADDITION_KERNEL_H -#define ARM_COMPUTE_CPU_GEMM_MATRIX_ADDITION_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel to perform the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta: - * - * @note [ MTX_OUT = MTX_0 + beta * MTX_1 ] with MTX_0 and MTX_1 of the same size - * - * @note This stage is used to finalize the GEMM result and it is computed if and only if beta != 0.0. In case this kernel is used for finalizing GEMM result, we have: - * - MTX_0 = A * B * alpha, where MTX_0 is the output of @ref CpuGemmMatrixMultiplyKernel - * - MTX_1 = C - */ -class CpuGemmMatrixAdditionKernel : public ICpuKernel -{ -public: - CpuGemmMatrixAdditionKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmMatrixAdditionKernel); - /** Initialise the kernel's input and output. - * - * @note The input and output tensor must have the same dimensions - * - * @param[in] src Input tensor info (Matrix C). Data types supported: F16/F32 - * @param[in, out] dst Output tensor info. If this kernel is used to finalize the GEMM result, output contains the result obtained by the kernel @ref CpuGemmMatrixMultiplyKernel. Data type supported: the same as @p src. - * @param[in] beta Weight of matrix C - */ - void configure(const ITensorInfo *src, ITensorInfo *dst, float beta); - /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmMatrixAdditionKernel. - * - * @note The input and output tensor must have the same dimensions - * - * Similar to @ref CpuGemmMatrixAdditionKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - /** Common signature for all the matrix addition functions - * - * @param[in] src An input tensor. Data types supported: F16/F32 - * @param[out] dst The output tensor. Data type supported: same as @p src - * @param[in] window Region on which to execute the kernel. - * @param[in] beta Weight of matrix C - */ - using MatrixAdditionFunctionPtr = void (*)(const ITensor *src, ITensor *dst, const Window &window, float beta); - /** Matrix addition function to use for the particular tensor types passed to configure() */ - MatrixAdditionFunctionPtr _func{ nullptr }; - float _beta{ 0.f }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_GEMM_MATRIX_ADDITION_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp b/src/core/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp deleted file mode 100644 index d86ea064de..0000000000 --- a/src/core/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp +++ /dev/null @@ -1,1174 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuGemmMatrixMultiplyKernel.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CPP/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/core/utils/helpers/float_ops.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void vector_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) -{ - const auto width_matrix_b = static_cast(dst->info()->dimension(0)); - const auto in_b_stride = static_cast(rhs->info()->strides_in_bytes()[1] / rhs->info()->element_size()); - const auto num_elems_vec_a = static_cast(lhs->info()->dimension(0)); - - // The implementation computes 32 elements per iteration - const int window_start_x = 32 * info.thread_id; - const int window_step_x = 32 * info.num_threads; - const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; - ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x, " (window_end_x - window_start_x) must be multiple of window_step_x"); - - Window win_out(window); - win_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_out.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Window win_a(window); - win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_a.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Window win_b; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(rhs->info()->num_dimensions() >= 3) - { - win_b = window; - } - win_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Iterator ina(lhs, win_a); - Iterator inb(rhs, win_b); - Iterator out(dst, win_out); - - const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); - - const float16x8_t alpha_f16 = vdupq_n_f16(alpha); - - execute_window_loop(win_out, [&](const Coordinates &) - { - int x = window_start_x; - // Here we don't 
check for x lower equal than (window_end_x - window_step_x) because of - // window_end_x is computed above which may cause out-of-bound writes to the dst. - for(; x < (window_end_x - window_step_x); x += window_step_x) - { - if(x > width_matrix_b) - { - return; - } - - auto matrix_b = reinterpret_cast(inb.ptr()) + x; - - float16x8_t acc0 = vdupq_n_f16(0.f); - float16x8_t acc1 = vdupq_n_f16(0.f); - float16x8_t acc2 = vdupq_n_f16(0.f); - float16x8_t acc3 = vdupq_n_f16(0.f); - - auto vec_a = reinterpret_cast(ina.ptr()); - const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a; - for(; vec_a <= (vec_a_end_addr - 4);) - { - const float16x4_t a0l = vld1_f16(vec_a); - - float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); - float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); - float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); - float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); - float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride); - float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride); - float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride); - float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride); - - acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0)); - acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0)); - acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0)); - acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0)); - acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1)); - acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1)); - acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1)); - acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1)); - - matrix_b += 2 * in_b_stride; - - b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); - b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); - b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); - b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); - b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride); - b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride); - b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride); - b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride); - - acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2)); - acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2)); - acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2)); - acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2)); - acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3)); - acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3)); - acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3)); - acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3)); - - vec_a += 4; - matrix_b += 2 * in_b_stride; - } - - for(; vec_a < vec_a_end_addr; ++vec_a) - { - const float16_t a0 = *vec_a; - const float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); - const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); - const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); - const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); - - acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0)); - acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0)); - acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0)); - acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0)); - - matrix_b += in_b_stride; - } - - // Multiply by the weight of matrix product (alpha) - if(multiply_alpha) - { - acc0 = vmulq_f16(acc0, alpha_f16); - acc1 = vmulq_f16(acc1, alpha_f16); - acc2 = vmulq_f16(acc2, alpha_f16); - acc3 = vmulq_f16(acc3, alpha_f16); - } - - auto vec_out = reinterpret_cast(out.ptr()) + x; - - vst1q_f16(vec_out + 0, acc0); - vst1q_f16(vec_out + 8, acc1); - vst1q_f16(vec_out + 16, acc2); - vst1q_f16(vec_out + 
24, acc3); - } - - for(; x < window_end_x; ++x) - { - if(x > width_matrix_b) - { - return; - } - - auto matrix_b = reinterpret_cast(inb.ptr()) + x; - - float16x4_t vacc = vdup_n_f16(0.f); - - auto vec_a = reinterpret_cast(ina.ptr()); - const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a; - for(; vec_a <= (vec_a_end_addr - 4); vec_a += 4) - { - const float16x4_t a0l = vld1_f16(vec_a); - - const float16x4_t b_col = - { - *(matrix_b + 0 * in_b_stride), - *(matrix_b + 1 * in_b_stride), - *(matrix_b + 2 * in_b_stride), - *(matrix_b + 3 * in_b_stride), - }; - - vacc = vadd_f16(vacc, vmul_f16(a0l, b_col)); - - matrix_b += 4 * in_b_stride; - } - - float16_t acc = vget_lane_f16(vacc, 0) + vget_lane_f16(vacc, 1) + vget_lane_f16(vacc, 2) + vget_lane_f16(vacc, 3); - - for(; vec_a < vec_a_end_addr; ++vec_a) - { - const float16_t a0 = *vec_a; - const float16_t b00 = *matrix_b; - - acc += b00 * a0; - - matrix_b += in_b_stride; - } - - // Multiply by the weight of matrix product (alpha) - if(multiply_alpha) - { - acc *= static_cast(alpha); - } - - auto vec_out = reinterpret_cast(out.ptr()) + x; - - *(vec_out) = acc; - } - }, - ina, inb, out); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) -{ - const auto width_matrix_b = static_cast(dst->info()->dimension(0)); - const auto in_b_stride = static_cast(rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type())); - const auto num_elems_vec_a = static_cast(lhs->info()->dimension(0)); - - // The implementation computes 16 elements per iteration - const int window_start_x = 16 * info.thread_id; - const int window_step_x = 16 * info.num_threads; - // Make sure (window_end_x - window_start_x) is a multiple of window_step_x - const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; - - Window win_out(window); - win_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_out.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Window win_a(window); - win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_a.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Window win_b; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(rhs->info()->num_dimensions() >= 3) - { - win_b = window; - } - win_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Iterator ina(lhs, win_a); - Iterator inb(rhs, win_b); - Iterator out(dst, win_out); - - const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); - - const float32x4_t alpha_f32 = vdupq_n_f32(alpha); - - execute_window_loop(win_out, [&](const Coordinates &) - { - int x = window_start_x; - // Here we don't check for x lower equal than (window_end_x - window_step_x) because of - // window_end_x is computed above which may cause out-of-bound writes to the dst. 
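// Note: as in the f16 variant above, window_end_x is rounded up to a multiple of
// window_step_x for each thread, so it can exceed the real row width width_matrix_b;
// the `if(x > width_matrix_b) { return; }` checks at the top of both loop bodies below
// are what guard against out-of-bounds stores to dst.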
- for(; x < (window_end_x - window_step_x); x += window_step_x) - { - if(x > width_matrix_b) - { - return; - } - - float32x4_t acc0 = vdupq_n_f32(0.f); - float32x4_t acc1 = vdupq_n_f32(0.f); - float32x4_t acc2 = vdupq_n_f32(0.f); - float32x4_t acc3 = vdupq_n_f32(0.f); - - auto vec_a = reinterpret_cast(ina.ptr()); - auto matrix_b = reinterpret_cast(inb.ptr()) + x; - -#if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b + in_b_stride))); -#endif /* __arm__ */ - - auto vec_a_end_addr = vec_a + num_elems_vec_a; - for(; vec_a <= (vec_a_end_addr - 4);) - { - float32x2_t a0l = vld1_f32(vec_a); - - float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); - float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); - float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); - float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); - - float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); - float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); - float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); - float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); - -#if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 1 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 2 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 3 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 4 * in_b_stride))); -#endif /* __arm__ */ - - acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); - acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); - acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); - acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); - - acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); - acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); - acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); - acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); - - vec_a += 2; - matrix_b += 2 * in_b_stride; - - a0l = vld1_f32(vec_a); - - b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); - b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); - b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); - b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); - - b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); - b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); - b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); - b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); - - acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); - acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); - acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); - acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); - - acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); - acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); - acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); - acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); - - vec_a += 2; - matrix_b += 2 * in_b_stride; - } - - for(; vec_a < vec_a_end_addr; ++vec_a) - { - const float a0 = *vec_a; - - const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); - const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); - const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); - const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); - - acc0 = vmlaq_n_f32(acc0, b00, a0); - acc1 = vmlaq_n_f32(acc1, b01, a0); - acc2 = vmlaq_n_f32(acc2, b02, a0); - acc3 = vmlaq_n_f32(acc3, b03, a0); - - matrix_b += in_b_stride; - } - - // Multiply by the weight of matrix 
product (alpha) - if(multiply_alpha) - { - acc0 = vmulq_f32(acc0, alpha_f32); - acc1 = vmulq_f32(acc1, alpha_f32); - acc2 = vmulq_f32(acc2, alpha_f32); - acc3 = vmulq_f32(acc3, alpha_f32); - } - - const auto vec_out = reinterpret_cast(out.ptr()) + x; - - vst1q_f32(vec_out + 0, acc0); - vst1q_f32(vec_out + 4, acc1); - vst1q_f32(vec_out + 8, acc2); - vst1q_f32(vec_out + 12, acc3); - } - - // Left-over loop - for(; x < window_end_x; ++x) - { - if(x > width_matrix_b) - { - return; - } - - float32x4_t vacc = vdupq_n_f32(0.f); - - auto vec_a = reinterpret_cast(ina.ptr()); - auto matrix_b = reinterpret_cast(inb.ptr()) + x; - -#if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b + in_b_stride))); -#endif /* __arm__ */ - - auto vec_a_end_addr = vec_a + num_elems_vec_a; - for(; vec_a <= (vec_a_end_addr - 4); vec_a += 4) - { - const float32x4_t a0l = vld1q_f32(vec_a); - - const float32x4_t b_col = - { - *(matrix_b + 0 * in_b_stride), - *(matrix_b + 1 * in_b_stride), - *(matrix_b + 2 * in_b_stride), - *(matrix_b + 3 * in_b_stride), - }; - -#if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 1 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 2 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 3 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 4 * in_b_stride))); -#endif /* __arm__ */ - - vacc = vmlaq_f32(vacc, b_col, a0l); - - matrix_b += 4 * in_b_stride; - } - - float acc = vgetq_lane_f32(vacc, 0) + vgetq_lane_f32(vacc, 1) + vgetq_lane_f32(vacc, 2) + vgetq_lane_f32(vacc, 3); - - for(; vec_a < vec_a_end_addr; ++vec_a) - { - const float a0 = *vec_a; - - const float b00 = *matrix_b; - - acc += b00 * a0; - - matrix_b += in_b_stride; - } - - // Multiply by the weight of matrix product (alpha) - if(multiply_alpha) - { - acc *= alpha; - } - - const auto vec_out = reinterpret_cast(out.ptr()) + x; - - *vec_out = acc; - } - }, - ina, inb, out); -} - -void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) -{ - ARM_COMPUTE_UNUSED(info); - const int out_width = static_cast(dst->info()->dimension(0)); - const int out_height = static_cast(dst->info()->dimension(1)); - const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()); - const size_t out_stride1 = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type()); - const size_t out_stride2 = out_stride1 * 2; - const size_t out_stride3 = out_stride1 * 3; - const int num_elems_matrix_b_x = rhs->info()->dimension(0); - - // Set step_x and step_y for matrix A. 
Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the dst matrix - Window win_a(window); - win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1)); - - Window win_b; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(rhs->info()->num_dimensions() >= 3) - { - win_b = window; - } - // Set step_x and step_y for matrix B. Scale by a factor of 4 the X range as the input transposed matrix A has 4 times less the cols of the dst matrix - // The step along the x direction is 2 times the in_b_stride because for each iteration we compute 2 blocks of size 4x4 - win_b.set(Window::DimX, Window::Dimension(window.x().start() / 4, window.x().end() / 4, 2 * in_b_stride)); - win_b.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Iterator ina(lhs, win_a); - Iterator inb(rhs, win_b); - Iterator out(dst, window); - - const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); - - const float32x4_t alpha_f32 = vdupq_n_f32(alpha); - - // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with CpuGemmInterleave4x4 and CpuGemmTranspose1xW - // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration - // All the values needed for computing a single 4x4 block will be read from consecutive memory positions - execute_window_loop(window, [&](const Coordinates & id) - { - auto mtx_a0 = reinterpret_cast(ina.ptr()); - auto mtx_b0 = reinterpret_cast(inb.ptr()); - auto mtx_b1 = mtx_b0 + in_b_stride; - - float32x4_t acc00 = vdupq_n_f32(0.f); - float32x4_t acc10 = vdupq_n_f32(0.f); - float32x4_t acc20 = vdupq_n_f32(0.f); - float32x4_t acc30 = vdupq_n_f32(0.f); - - float32x4_t acc01 = vdupq_n_f32(0.f); - float32x4_t acc11 = vdupq_n_f32(0.f); - float32x4_t acc21 = vdupq_n_f32(0.f); - float32x4_t acc31 = vdupq_n_f32(0.f); - -#if __arm__ - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(mtx_a0))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(mtx_b0))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(mtx_b1))); -#endif /* __arm__ */ - - auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x; - for(; mtx_b0 <= (mtx_b0_end_addr - 32);) - { - float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); - float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); - float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); - float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); - - float32x4_t b00 = vld1q_f32(mtx_b0); - float32x4_t b10 = vld1q_f32(mtx_b1); - float32x4_t b01 = vld1q_f32(mtx_b0 + 4); - float32x4_t b11 = vld1q_f32(mtx_b1 + 4); - -#if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_a0))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b0))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b1))); -#endif /* __arm__ */ - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4); - float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5); - float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6); - float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = 
vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b01, a4); - acc10 = vmlaq_f32(acc10, b01, a5); - acc20 = vmlaq_f32(acc20, b01, a6); - acc30 = vmlaq_f32(acc30, b01, a7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b11, a4); - acc11 = vmlaq_f32(acc11, b11, a5); - acc21 = vmlaq_f32(acc21, b11, a6); - acc31 = vmlaq_f32(acc31, b11, a7); - - mtx_a0 += 8; - mtx_b0 += 8; - mtx_b1 += 8; - - a0 = vld1q_dup_f32(mtx_a0 + 0); - a1 = vld1q_dup_f32(mtx_a0 + 1); - a2 = vld1q_dup_f32(mtx_a0 + 2); - a3 = vld1q_dup_f32(mtx_a0 + 3); - - b00 = vld1q_f32(mtx_b0); - b10 = vld1q_f32(mtx_b1); - b01 = vld1q_f32(mtx_b0 + 4); - b11 = vld1q_f32(mtx_b1 + 4); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - a4 = vld1q_dup_f32(mtx_a0 + 4); - a5 = vld1q_dup_f32(mtx_a0 + 5); - a6 = vld1q_dup_f32(mtx_a0 + 6); - a7 = vld1q_dup_f32(mtx_a0 + 7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b01, a4); - acc10 = vmlaq_f32(acc10, b01, a5); - acc20 = vmlaq_f32(acc20, b01, a6); - acc30 = vmlaq_f32(acc30, b01, a7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b11, a4); - acc11 = vmlaq_f32(acc11, b11, a5); - acc21 = vmlaq_f32(acc21, b11, a6); - acc31 = vmlaq_f32(acc31, b11, a7); - - mtx_a0 += 8; - mtx_b0 += 8; - mtx_b1 += 8; - - a0 = vld1q_dup_f32(mtx_a0 + 0); - a1 = vld1q_dup_f32(mtx_a0 + 1); - a2 = vld1q_dup_f32(mtx_a0 + 2); - a3 = vld1q_dup_f32(mtx_a0 + 3); - b00 = vld1q_f32(mtx_b0); - b10 = vld1q_f32(mtx_b1); - b01 = vld1q_f32(mtx_b0 + 4); - b11 = vld1q_f32(mtx_b1 + 4); - -#if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_a0))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b0))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b1))); -#endif /* __arm__ */ - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - a4 = vld1q_dup_f32(mtx_a0 + 4); - a5 = vld1q_dup_f32(mtx_a0 + 5); - a6 = vld1q_dup_f32(mtx_a0 + 6); - a7 = vld1q_dup_f32(mtx_a0 + 7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b01, a4); - acc10 = vmlaq_f32(acc10, b01, a5); - acc20 = vmlaq_f32(acc20, b01, a6); - acc30 = vmlaq_f32(acc30, b01, a7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b11, a4); - acc11 = vmlaq_f32(acc11, b11, a5); - acc21 = vmlaq_f32(acc21, b11, a6); - acc31 = vmlaq_f32(acc31, b11, a7); - - mtx_a0 += 8; - mtx_b0 += 8; - mtx_b1 += 8; - - a0 = vld1q_dup_f32(mtx_a0 + 0); - a1 = vld1q_dup_f32(mtx_a0 + 1); - a2 = vld1q_dup_f32(mtx_a0 + 2); - a3 = vld1q_dup_f32(mtx_a0 + 3); - b00 = vld1q_f32(mtx_b0); - b10 = vld1q_f32(mtx_b1); - b01 = vld1q_f32(mtx_b0 + 4); - b11 = vld1q_f32(mtx_b1 + 4); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - a4 = vld1q_dup_f32(mtx_a0 + 4); - a5 = vld1q_dup_f32(mtx_a0 + 5); - a6 = vld1q_dup_f32(mtx_a0 + 6); - a7 = vld1q_dup_f32(mtx_a0 + 7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); 
- acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b01, a4); - acc10 = vmlaq_f32(acc10, b01, a5); - acc20 = vmlaq_f32(acc20, b01, a6); - acc30 = vmlaq_f32(acc30, b01, a7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b11, a4); - acc11 = vmlaq_f32(acc11, b11, a5); - acc21 = vmlaq_f32(acc21, b11, a6); - acc31 = vmlaq_f32(acc31, b11, a7); - - mtx_a0 += 8; - mtx_b0 += 8; - mtx_b1 += 8; - } - - for(; mtx_b0 < mtx_b0_end_addr;) - { - float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); - float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); - float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); - float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); - float32x4_t b00 = vld1q_f32(mtx_b0); - float32x4_t b10 = vld1q_f32(mtx_b1); - -#if __arm__ - asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_a0))); - asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_b0))); - asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_b1))); -#endif /* __arm__ */ - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - mtx_a0 += 4; - mtx_b0 += 4; - mtx_b1 += 4; - } - - // Multiply by the weight of matrix product (alpha) - if(multiply_alpha) - { - acc00 = vmulq_f32(acc00, alpha_f32); - acc10 = vmulq_f32(acc10, alpha_f32); - acc20 = vmulq_f32(acc20, alpha_f32); - acc30 = vmulq_f32(acc30, alpha_f32); - acc01 = vmulq_f32(acc01, alpha_f32); - acc11 = vmulq_f32(acc11, alpha_f32); - acc21 = vmulq_f32(acc21, alpha_f32); - acc31 = vmulq_f32(acc31, alpha_f32); - } - - const auto mtx_out0 = reinterpret_cast(out.ptr()); - const auto mtx_out1 = mtx_out0 + 4; - - if(id.x() < (out_width - 8)) - { - vst1q_f32(mtx_out0, acc00); - vst1q_f32(mtx_out1, acc01); - if(id.y() + 1 < out_height) - { - vst1q_f32(mtx_out0 + out_stride1, acc10); - vst1q_f32(mtx_out1 + out_stride1, acc11); - if(id.y() + 2 < out_height) - { - vst1q_f32(mtx_out0 + out_stride2, acc20); - vst1q_f32(mtx_out1 + out_stride2, acc21); - if(id.y() + 3 < out_height) - { - vst1q_f32(mtx_out0 + out_stride3, acc30); - vst1q_f32(mtx_out1 + out_stride3, acc31); - } - } - } - } - else if(id.x() < (out_width - 4)) - { - vst1q_f32(mtx_out0, acc00); - if(id.y() + 1 < out_height) - { - vst1q_f32(mtx_out0 + out_stride1, acc10); - if(id.y() + 2 < out_height) - { - vst1q_f32(mtx_out0 + out_stride2, acc20); - if(id.y() + 3 < out_height) - { - vst1q_f32(mtx_out0 + out_stride3, acc30); - } - } - } - // Left-over columns - const int columns_left = out_width - id.x() - 4; - for(auto x = 0; x < columns_left; ++x) - { - *(mtx_out1 + x) = acc01[x]; - if(id.y() + 1 < out_height) - { - *(mtx_out1 + x + out_stride1) = acc11[x]; - if(id.y() + 2 < out_height) - { - *(mtx_out1 + x + out_stride2) = acc21[x]; - if(id.y() + 3 < out_height) - { - *(mtx_out1 + x + out_stride3) = acc31[x]; - } - } - } - } - } - else - { - // Left-over columns - const int columns_left = out_width - id.x(); - for(int x = 0; x < columns_left; ++x) - { - *(mtx_out0 + x) = acc00[x]; - if(id.y() + 1 < out_height) - { - *(mtx_out0 + x + out_stride1) = acc10[x]; - if(id.y() + 2 < out_height) - { - *(mtx_out0 + x + out_stride2) = acc20[x]; - if(id.y() + 3 < out_height) - { - *(mtx_out0 + x + out_stride3) = acc30[x]; - } - } - } - } - } - }, - ina, inb, out); -} - 
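Stripped of the intrinsics, the unrolled loop above performs one rank-1 update per k step over a pair of 4x4 output blocks whose operands are contiguous in memory thanks to the CpuGemmInterleave4x4 / CpuGemmTranspose1xW reshapes. A minimal scalar reference for a single 4x4 block under the same layouts (names are illustrative, and alpha is applied unconditionally for brevity):

```cpp
#include <array>

// Scalar reference for one 4x4 output block, using the data layouts the NEON
// kernel consumes:
//   - a_interleaved: matrix A after interleaving, so for each k the four
//     consecutive values are A(row0..row3, k).
//   - b_transposed:  matrix B after the 1xW transpose (W = 4 for F32), so for
//     each k the four consecutive values are B(k, col0..col3).
void gemm_block_4x4_reference(const float *a_interleaved,
                              const float *b_transposed,
                              int          k_depth,
                              float        alpha,
                              float (&acc)[4][4])
{
    for(auto &row : acc)
    {
        row[0] = row[1] = row[2] = row[3] = 0.f;
    }

    for(int k = 0; k < k_depth; ++k)
    {
        // Both operands for step k sit in consecutive memory, which is the
        // whole point of the interleave/transpose reshapes.
        const float *a_col = a_interleaved + 4 * k; // A(0..3, k)
        const float *b_row = b_transposed + 4 * k;  // B(k, 0..3)

        for(int r = 0; r < 4; ++r)
        {
            for(int c = 0; c < 4; ++c)
            {
                acc[r][c] += a_col[r] * b_row[c];
            }
        }
    }

    // Weight of the matrix product, as in the kernel's multiply_alpha path.
    for(auto &row : acc)
    {
        for(float &v : row)
        {
            v *= alpha;
        }
    }
}
```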
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void matrix_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) -{ - ARM_COMPUTE_UNUSED(info); - const int out_width = static_cast(dst->info()->dimension(0)); - const int out_height = static_cast(dst->info()->dimension(1)); - const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()); - const size_t out_stride = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type()); - const int num_elems_matrix_b_x = rhs->info()->dimension(0); - - // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the dst matrix - Window win_a(window); - win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1)); - - Window win_b; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(rhs->info()->num_dimensions() >= 3) - { - win_b = window; - } - // Set step_x and step_y for matrix B. Scale by a factor of 8 the X range as the input transposed matrix A has 8 times less the cols of the dst matrix - win_b.set(Window::DimX, Window::Dimension(window.x().start() / 8, window.x().end() / 8, in_b_stride)); - win_b.set(Window::DimY, Window::Dimension(0, 1, 0)); - - Iterator ina(lhs, win_a); - Iterator inb(rhs, win_b); - Iterator out(dst, window); - - const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); - - const float16x8_t alpha_f16 = vdupq_n_f16(alpha); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto *mtx_a0 = reinterpret_cast(ina.ptr()); - const auto *mtx_b0 = reinterpret_cast(inb.ptr()); - auto *mtx_out = reinterpret_cast(out.ptr()); - float16x8x4_t c = - { - { - vdupq_n_f16(0.f), - vdupq_n_f16(0.f), - vdupq_n_f16(0.f), - vdupq_n_f16(0.f) - } - }; - - /* - This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values) - |a00 a01 a02 a03 | a04 a05 a06 a07| - |a10 a11 a12 a13 | a14 a15 a16 a17| - |a20 a21 a22 a23 | a24 a25 a26 a27| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 | a40 a50 a60 a70 | ... - |a30 a31 a32 a33 | a34 a35 a36 a37| | a04 a14 a24 a34 || a05 a15 a25 a35 || a06 a15 a26 a36 || a07 a17 a27 a37 | a44 a54 a64 a74 | ... - |a40 a41 a42 a43 | a44 a45 a46 a47| - |a50 a51 a52 a53 | a54 a55 a56 a57| - |a60 a61 a62 a63 | a64 a65 a66 a67| - |a70 a71 a72 a73 | a74 a75 a76 a77| - - After this operation, the dst matrix will have the following shape: [ height * 4, width / 4 ] - - B Matrix has been transposed as shown below - - |b00 b01 b02 b03 b04 b05 b06 b07| - |b10 b11 b12 b13 b14 b15 b16 b17| - |b20 b21 b22 b23 b24 b25 b26 b27| - |b30 b31 b32 b33 b34 b35 b36 b37| - -------------------> - - |b00 b01 b02 b03 b04 b05 b06 b07||b10 b11 b12 b13 b14 b15 b16 b17||b20 b21 b22 b23 b24 b25 b26 b27||b30 b31 b32 b33 b34 b35 b36 b37| - - c.val[0][0] = a00*b00 + a01*b10 + a02*b20 + a03*b30 - c.val[0][1] = a00*b01 + a01*b11 + a02*b21 + a03*b31 - - The size of the dst tensor's XY-plane must be the following shape [ width * 8, height / 8 ]. All other dimensions must have the same size. 
- */ - const float16_t *mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x; - - for(; mtx_b0 <= (mtx_b0_end_addr - 32);) - - { - const float16x8_t p00 = vld1q_f16(mtx_a0); - const float16x8_t p02 = vld1q_f16(mtx_a0 + 8); - - const float16x8_t q00 = vld1q_f16(mtx_b0); - const float16x8_t q02 = vld1q_f16(mtx_b0 + 8); - const float16x8_t q04 = vld1q_f16(mtx_b0 + 16); - const float16x8_t q06 = vld1q_f16(mtx_b0 + 24); - - c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vgetq_lane_f16(p00, 0))); - c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vgetq_lane_f16(p00, 1))); - c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vgetq_lane_f16(p00, 2))); - c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vgetq_lane_f16(p00, 3))); - - c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q02, vgetq_lane_f16(p00, 4))); - c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q02, vgetq_lane_f16(p00, 5))); - c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q02, vgetq_lane_f16(p00, 6))); - c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q02, vgetq_lane_f16(p00, 7))); - - c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q04, vgetq_lane_f16(p02, 0))); - c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q04, vgetq_lane_f16(p02, 1))); - c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q04, vgetq_lane_f16(p02, 2))); - c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q04, vgetq_lane_f16(p02, 3))); - - c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q06, vgetq_lane_f16(p02, 4))); - c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q06, vgetq_lane_f16(p02, 5))); - c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q06, vgetq_lane_f16(p02, 6))); - c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q06, vgetq_lane_f16(p02, 7))); - - mtx_a0 += 16; - mtx_b0 += 32; - } - - for(; mtx_b0 < mtx_b0_end_addr;) - - { - const float16x4_t p00 = vld1_f16(mtx_a0); - const float16x8_t q00 = vld1q_f16(mtx_b0); - - c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vget_lane_f16(p00, 0))); - c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vget_lane_f16(p00, 1))); - c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vget_lane_f16(p00, 2))); - c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vget_lane_f16(p00, 3))); - - mtx_a0 += 4; - mtx_b0 += 8; - } - - if(multiply_alpha) - { - c.val[0] = vmulq_f16(c.val[0], alpha_f16); - c.val[1] = vmulq_f16(c.val[1], alpha_f16); - c.val[2] = vmulq_f16(c.val[2], alpha_f16); - c.val[3] = vmulq_f16(c.val[3], alpha_f16); - } - - if(id.x() < (out_width - 8)) - { - vst1q_f16(mtx_out, c.val[0]); - if(id.y() + 1 < out_height) - { - vst1q_f16(mtx_out + 1 * out_stride, c.val[1]); - if(id.y() + 2 < out_height) - { - vst1q_f16(mtx_out + 2 * out_stride, c.val[2]); - if(id.y() + 3 < out_height) - { - vst1q_f16(mtx_out + 3 * out_stride, c.val[3]); - } - } - } - } - else - { - // Left-over columns - const int columns_left = out_width - id.x(); - for(int x = 0; x < columns_left; ++x) - { - *(mtx_out + x) = c.val[0][x]; - if(id.y() + 1 < out_height) - { - *(mtx_out + x + 1 * out_stride) = c.val[1][x]; - if(id.y() + 2 < out_height) - { - *(mtx_out + x + 2 * out_stride) = c.val[2][x]; - if(id.y() + 3 < out_height) - { - *(mtx_out + x + 3 * out_stride) = c.val[3][x]; - } - } - } - } - } - }, - ina, inb, out); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -inline Status validate_arguments(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info) -{ - ARM_COMPUTE_UNUSED(alpha); - - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(lhs); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, 
DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst); - - if(!is_interleaved) - { - ARM_COMPUTE_RETURN_ERROR_ON(lhs->dimension(0) != rhs->dimension(1)); - - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON(rhs->dimension(0) != dst->dimension(0)); - ARM_COMPUTE_RETURN_ERROR_ON(lhs->dimension(1) != dst->dimension(1)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); - } - } - else - { - const int m = reshape_info.m(); - const int n = reshape_info.n(); - const int k = reshape_info.k(); - const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width(); - const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); - - /* Interleave */ - TensorShape tensor_shape0{ lhs->tensor_shape() }; - tensor_shape0.set(0, k); - tensor_shape0.set(1, m); - - const TensorInfo tensor_info0 = lhs->clone()->set_tensor_shape(tensor_shape0); - const TensorInfo tensor_info_reshaped0 = lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_interleaved_shape(tensor_info0, mult_interleave4x4_height)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lhs, &tensor_info_reshaped0); - - if(n != 0) /* Transpose */ - { - TensorShape tensor_shape1{ rhs->tensor_shape() }; - tensor_shape1.set(0, n); - tensor_shape1.set(1, k); - - const TensorInfo tensor_info1 = rhs->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(rhs, &tensor_info_reshaped1); - } - - if(dst->total_size() != 0) - { - if(n != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON(dst->dimension(0) != static_cast(n)); - } - ARM_COMPUTE_RETURN_ERROR_ON(dst->dimension(1) != static_cast(m)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); - } - } - - return Status{}; -} -} // namespace - -void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); - - // dst tensor auto inizialitation if not yet initialized - TensorShape tensor_shape{ lhs->tensor_shape() }; - tensor_shape.set(0, is_interleaved ? reshape_info.n() : rhs->dimension(0)); - tensor_shape.set(1, is_interleaved ? reshape_info.m() : lhs->dimension(1)); - - auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(tensor_shape)); - - // Perform validate step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(lhs, rhs, dst, alpha, is_interleaved, reshape_info)); - - _alpha = alpha; - - // Configure kernel window - Window win{}; - - // Check if the dst tensor is a vector. If so,the kernel runs the vector-matrix multiplication - const bool is_dst_vector = (dst->dimension(1) == 1); - if(is_dst_vector) - { - const unsigned int num_elems_processed_per_iteration_x = (lhs->data_type() == DataType::F32) ? 16 : 32; - - win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x)); - } - else - { - constexpr unsigned int num_elems_processed_per_iteration_x = 8; - constexpr unsigned int num_elems_processed_per_iteration_y = 4; - - win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - } - - switch(lhs->data_type()) - { - case DataType::F32: - { - _func = (is_dst_vector) ? 
vector_matrix_multiply_f32 : matrix_matrix_multiply_f32; - break; - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - _func = (is_dst_vector) ? vector_matrix_multiply_f16 : matrix_matrix_multiply_f16; - break; - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - { - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - } - ICPPKernel::configure(win); -} - -Status CpuGemmMatrixMultiplyKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, - const GEMMReshapeInfo &reshape_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(lhs, rhs, dst, alpha, is_interleaved, reshape_info)); - - return Status{}; -} - -void CpuGemmMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - ARM_COMPUTE_ERROR_ON(tensors.empty()); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - const ITensor *lhs = tensors.get_const_tensor(TensorType::ACL_SRC_0); - const ITensor *rhs = tensors.get_const_tensor(TensorType::ACL_SRC_1); - ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - - (*_func)(lhs, rhs, dst, window, info, _alpha); -} - -const char *CpuGemmMatrixMultiplyKernel::name() const -{ - return "CpuGemmMatrixMultiplyKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuGemmMatrixMultiplyKernel.h b/src/core/cpu/kernels/CpuGemmMatrixMultiplyKernel.h deleted file mode 100644 index 974ff85606..0000000000 --- a/src/core/cpu/kernels/CpuGemmMatrixMultiplyKernel.h +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_GEMM_MATRIX_MULTIPLY_KERNEL_H -#define ARM_COMPUTE_CPU_GEMM_MATRIX_MULTIPLY_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel to multiply two input matrices "A" and "B". 
All elements of the output matrix/vector will be multiplied by alpha after the matrix multiplication - * - * @note If the output tensor is a matrix, the implementation assumes that the input tensors @p lhs and @p rhs are both matrices and reshaped respectively with @ref CpuGemmInterleave4x4Kernel" and @ref CpuGemmTranspose1xWKernel - * @note If the output tensor is a vector and the data type is F32, the implementation assumes that the first input tensor @p lhs is a vector and the second input tensor @p rhs a matrix. The implementation also assumes that both tensors have not been reshaped - * - */ -class CpuGemmMatrixMultiplyKernel : public ICpuKernel -{ -public: - CpuGemmMatrixMultiplyKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmMatrixMultiplyKernel); - /** Initialise the kernel's input and output. - * - * @note If the output tensor is a matrix, the input matrices @p lhs and @p rhs should be the output of the kernels: @ref CpuGemmInterleave4x4Kernel and @ref CpuGemmTranspose1xWKernel - * These two kernels change the layout of the original matrices to be more cache-friendly. - * - * @param[in] lhs Left-handside tensor info containing the interleaved Matrix A or the vector A. Data types supported: F16/F32 - * @param[in] rhs Right-handside tensor info containing the transposed Matrix B if the first input tensor A is not a vector. - * If the output tensor is a vector, rhs must contain the matrix B not reshaped. Data type supported: same as @p lhs - * @param[out] dst Output tensor to store the result of matrix multiplication. Data type supported: same as @p lhs. - * @param[in] alpha Weight of the matrix product - * @param[in] is_interleaved (Optional) True if lhs and rhs have been reshaped respectively using @ref CpuGemmInterleave4x4Kernel and @ref CpuGemmTranspose1xWKernel - * @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how @p lhs and @p rhs have been reshaped - */ - void configure(const ITensorInfo *lhs, const ITensorInfo *rhs, ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmMatrixMultiplyKernel - * - * Similar to @ref CpuGemmMatrixMultiplyKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - /** Common signature for all the matrix multiply functions - * - * @param[in] lhs Left-handside input tensor. Data types supported: F16/F32 - * @param[in] rhs Right-handside input tensor. Data types supported: same as @p lhs - * @param[out] dst The output tensor. Data type supported: same as @p rhs - * @param[in] window Region on which to execute the kernel. - * @param[in] info Thread info metadata. - * @param[in] alpha Weight of the matrix product. 
- */ - using GemmFunctionPtr = void(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha); - /** Matrix multiply function to use for the particular tensor types passed to configure() */ - GemmFunctionPtr *_func{ nullptr }; - float _alpha{ 1.f }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_GEMM_MATRIX_MULTIPLY_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuGemmTranspose1xWKernel.cpp b/src/core/cpu/kernels/CpuGemmTranspose1xWKernel.cpp deleted file mode 100644 index 4b059f57cb..0000000000 --- a/src/core/cpu/kernels/CpuGemmTranspose1xWKernel.cpp +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuGemmTranspose1xWKernel.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -using namespace arm_compute::misc::shape_calculator; - -void CpuGemmTranspose1xWKernel::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*src))); - - // Perform validate step - ARM_COMPUTE_ERROR_THROW_ON(CpuGemmTranspose1xWKernel::validate(src, dst)); - - const size_t vector_size = 16 / src->element_size(); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps(vector_size)); - ICPPKernel::configure(win); -} - -Status CpuGemmTranspose1xWKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. 
- - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_transpose1xW_with_element_size_shape(*src)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - } - - return Status{}; -} - -void CpuGemmTranspose1xWKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - ARM_COMPUTE_ERROR_ON(tensors.empty()); - - /* - * Following an example of how the transposition1xW works when the src data type is F32 - * - * |a00 a01 a02 a03| - * |a10 a11 a12 a13| - * |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 | - * |a30 a31 a32 a33| - * - * The dst matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) - */ - - // Set window for dst tensor. Set to 0 the X and Y dimensions in order to allow multi-threading implementation and future batched matrix multiplications - Window win_out(window); - win_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - - const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); - ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - - Iterator in(src, window); - Iterator out(dst, win_out); - - const size_t in_width = src->info()->dimension(0); - const size_t element_size = src->info()->element_size(); - const size_t out_stride = dst->info()->strides_in_bytes()[1]; - const size_t vector_size = 16 / element_size; - - execute_window_loop(window, [&](const Coordinates & id) - { - const uint8_t *in_ptr = in.ptr(); - uint8_t *const out_ptr = out.ptr() + (id.y() * vector_size) * element_size + (id.x() / vector_size) * out_stride; - - for(size_t k = 0; k < vector_size; ++k) - { - // If the src width is not multiple of W, we fill the reference with 0s - if((id.x() + k) >= in_width) - { - std::memset(out_ptr + k * element_size, 0, element_size); - } - else - { - std::memcpy(out_ptr + k * element_size, in_ptr + k * element_size, element_size); - } - } - }, - in, out); -} - -const char *CpuGemmTranspose1xWKernel::name() const -{ - return "CpuGemmTranspose1xWKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuGemmTranspose1xWKernel.h b/src/core/cpu/kernels/CpuGemmTranspose1xWKernel.h deleted file mode 100644 index 1a9287f7b0..0000000000 --- a/src/core/cpu/kernels/CpuGemmTranspose1xWKernel.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_GEMM_TRANSPOSE1xW_KERNEL_H -#define ARM_COMPUTE_CPU_GEMM_TRANSPOSE1xW_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel which transposes the elements of a matrix in chunks of 1xW, where W is equal to (16 / element size of the tensor) - * - * Following an example of how the transposition1xW works when the input data is F32 - * - * @f[ - * \left( \begin{array}{cccc} - * a00 & a01 & a02 & a03 \\ - * a10 & a11 & a12 & a13 \\ - * a20 & a21 & a22 & a23 \\ - * a30 & a31 & a32 & a33 \\ - * \end{array} \right) - * \rightarrow - * \left( \begin{array}{ccccccccccccccccc} - * a00 & a01 & a02 & a03 & a10 & a11 & a12 & a13 & a20 & a21 & a22 & a23 & a30 & a31 & a32 & a33 \\ - * \end{array} \right) - * @f] - * - * Following an example of how the transposition1xW works when the input data type is F16 - * - * @f[ - * \left( \begin{array}{cccccccc} - * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 \\ - * a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 \\ - * a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 \\ - * a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37 \\ - * \end{array} \right) - * \rightarrow - * \left( \begin{array}{cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc} - * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 & a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 & a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 & a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37\\ - * \end{array} \right) - * @f] - * - * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) - * - */ -class CpuGemmTranspose1xWKernel : public ICpuKernel -{ -public: - CpuGemmTranspose1xWKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmTranspose1xWKernel); - /** Configure kernel for a given list of arguments - * - * @param[in] src Input tensor info. Data types supported: All - * @param[out] dst Output tensor info. Data type supported: same as @p src. - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmTranspose1xWKernel - * - * Similar to @ref CpuGemmTranspose1xWKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_GEMM_TRANSPOSE1xW_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuIm2ColKernel.cpp b/src/core/cpu/kernels/CpuIm2ColKernel.cpp deleted file mode 100644 index ca6c9bfab4..0000000000 --- a/src/core/cpu/kernels/CpuIm2ColKernel.cpp +++ /dev/null @@ -1,448 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
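The 1xW transposition documented above has a compact scalar equivalent: each source row is cut into chunks of W = 16 / element_size values, chunk c of source row y becomes columns [y*W, y*W + W) of destination row c, and chunks running past the source width are zero-filled exactly as in run_op. A sketch assuming F32 (so W = 4); the function name and the use of std::vector are illustrative only:

```cpp
#include <vector>

// Scalar reference of the 1xW transposition. Destination shape is
// [height * W, ceil(width / W)], matching the note in the kernel documentation.
std::vector<float> transpose_1xW(const std::vector<float> &src, int width, int height)
{
    constexpr int W = 16 / sizeof(float); // 4 elements per chunk for F32

    const int chunks_per_row = (width + W - 1) / W; // ceil(width / W)
    std::vector<float> dst(static_cast<size_t>(chunks_per_row) * height * W, 0.f); // zero-filled tail

    for(int y = 0; y < height; ++y)
    {
        for(int x = 0; x < width; ++x)
        {
            const int chunk = x / W; // which destination row the chunk lands in
            const int lane  = x % W; // position inside the chunk
            dst[static_cast<size_t>(chunk) * (height * W) + y * W + lane] =
                src[static_cast<size_t>(y) * width + x];
        }
    }
    return dst;
}
```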
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuIm2ColKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Size2D.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - -#include -#include -#include -#include -#include - -namespace arm_compute -{ -using namespace misc::shape_calculator; -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation, unsigned int num_groups) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(input->data_type()) && has_bias); - ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Number of groups greater than one are not supported on Neon"); - - // Since there's no implicit padding added, check the total input spatial dimensions (with conv paddings) are big enough for the kernel dimensions - const unsigned int width_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - const unsigned total_width = input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right(); - const unsigned total_height = input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom(); - ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height)); - - if(output->total_size() > 0) - { - TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output); - 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); - } - - return Status{}; -} - -template -inline void linearize_volume_nchw(const uint8_t *const in_ptr, - T *out_ptr, - bool has_bias, - int top_left_x, - int top_left_y, - int kernel_width, - int kernel_height, - int kernel_depth, - int input_w, - int input_h, - int input_stride_x, - int input_stride_y, - int input_stride_z, - int pad_value, - int dilation_x, - int dilation_y) -{ - const int kernel_size2 = kernel_width * kernel_height; - const int x_e = top_left_x + kernel_width * dilation_x; - const int y_e = top_left_y + kernel_height * dilation_y; - - // Linearize volume - int d = 0; - // This for loop linearize a volume with 3 slices. This allows: - // 1) to reduce the iterations of the outer for loop "d" - // 2) to have an optimized im2col for the first convolution layer where usually we have 3 IFMs - for(; d <= (kernel_depth - 3); d += 3) - { - for(int y = top_left_y; y < y_e; y += dilation_y) - { - if((y < 0 || y >= input_h) && has_pads) - { - // All the values will be the offset (will be zeros when not quantized) - for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) - { - *(out_ptr + 0 * kernel_size2) = pad_value; - *(out_ptr + 1 * kernel_size2) = pad_value; - *(out_ptr + 2 * kernel_size2) = pad_value; - } - } - else - { - for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) - { - if((x < 0 || x >= input_w) && has_pads) - { - *(out_ptr + 0 * kernel_size2) = pad_value; - *(out_ptr + 1 * kernel_size2) = pad_value; - *(out_ptr + 2 * kernel_size2) = pad_value; - } - else - { - *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast(in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x))); - *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast(in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x))); - *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast(in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x))); - } - } - } - } - out_ptr += 2 * kernel_size2; - } - - // Left over - for(; d < kernel_depth; d++) - { - for(int y = top_left_y; y < y_e; y += dilation_y) - { - if((y < 0 || y >= input_h) && has_pads) - { - // All the values will be the offset (will be zeros when not quantized) - memset(static_cast(out_ptr), pad_value, kernel_width * sizeof(T)); - out_ptr += kernel_width; - } - else - { - for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) - { - if((x < 0 || x >= input_w) && has_pads) - { - *out_ptr = pad_value; - } - else - { - *out_ptr = *(reinterpret_cast(in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x))); - } - } - } - } - } - - // Append 1 if the convolution layer has biases - if(has_bias) - { - *out_ptr = static_cast(1); - } -} - -template -inline void linearize_volume_nhwc(const uint8_t *const in_ptr, - T *out_ptr, - bool has_bias, - int start_x, - int start_y, - int kernel_width, - int kernel_height, - int input_w, - int input_h, - int input_c, - int input_stride_y, - int input_stride_z, - int pad_value, - int dilation_x, - int dilation_y) -{ - const int end_x = start_x + kernel_width * dilation_x; - const int end_y = start_y + kernel_height * dilation_y; - const int pad_quant = kernel_width * input_c; - const int element_size = static_cast(sizeof(T)); - if((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && (input_stride_y == input_c * element_size)) - { - for(int y = start_y; 
y < end_y; y += dilation_y) - { - //optimized for no dilation and no boundary pixels - memcpy(out_ptr, reinterpret_cast(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size); - out_ptr += input_c * kernel_width; - } - } - else - { - for(int y = start_y; y < end_y; y += dilation_y) - { - if(y < 0 || y >= input_h) - { - memset(static_cast(out_ptr), pad_value, pad_quant * element_size); - out_ptr += pad_quant; - } - else if(dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != input_c * element_size) - { - for(int x = start_x; x < end_x; x += dilation_x) - { - if(x < 0 || x >= input_w) - { - memset(static_cast(out_ptr), pad_value, input_c * element_size); - out_ptr += input_c; - } - else - { - memcpy(out_ptr, reinterpret_cast(in_ptr + (y * input_stride_z + x * input_stride_y)), input_c * element_size); - out_ptr += input_c; - } - } - } - else - { - //optimized for no dilation and no boundary pixels - memcpy(out_ptr, reinterpret_cast(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size); - out_ptr += input_c * kernel_width; - } - } - } - // Append 1 if the convolution layer has biases - if(has_bias) - { - *out_ptr = static_cast(1); - } -} -} // namespace - -template -void CpuIm2ColKernel::run_im2col(const ITensor *src, ITensor *dst, const Window &window) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - - const int input_w = src->info()->dimension(width_idx); - const int input_h = src->info()->dimension(height_idx); - const int input_c = src->info()->dimension(channel_idx); - const int input_stride_x = src->info()->strides_in_bytes().x(); - const int input_stride_y = src->info()->strides_in_bytes().y(); - const int input_stride_z = src->info()->strides_in_bytes().z(); - const int pad_left = _conv_info.pad_left(); - const int pad_top = _conv_info.pad_top(); - const int stride_x = _conv_info.stride().first; - const int stride_y = _conv_info.stride().second; - const int pad_value = is_data_type_quantized(src->info()->data_type()) ? 
src->info()->quantization_info().uniform().offset : 0; - - Window window_in_out(window); - // The first three dimensions of the input and output are increased by the inner loops - window_in_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_in_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - // Create iterators - Iterator in(src, window_in_out); - Iterator out(dst, window_in_out); - - execute_window_loop(window, [&](const Coordinates & id) - { - const int start_w = id[width_idx] * stride_x - pad_left; - const int start_h = id[height_idx] * stride_y - pad_top; - - // Get pointers - const uint8_t *const input_ptr = in.ptr(); - auto output_ptr = reinterpret_cast(out.ptr() + (id[width_idx] + id[height_idx] * _convolved_dims.first) * dst->info()->strides_in_bytes().y()); - - // Linearize volume - if(is_nchw) - { - linearize_volume_nchw(input_ptr, - output_ptr, - _has_bias, - start_w, - start_h, - _kernel_width, - _kernel_height, - input_c, - input_w, - input_h, - input_stride_x, - input_stride_y, - input_stride_z, - pad_value, - _dilation.x(), - _dilation.y()); - } - else - { - linearize_volume_nhwc(input_ptr, - output_ptr, - _has_bias, - start_w, - start_h, - _kernel_width, - _kernel_height, - input_w, - input_h, - input_c, - input_stride_y, - input_stride_z, - pad_value, - _dilation.x(), - _dilation.y()); - } - }, - in, out); -} - -void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation, unsigned int num_groups) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups)); - ARM_COMPUTE_UNUSED(num_groups); - - _data_layout = src->data_layout(); - const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - - _conv_info = conv_info; - _kernel_width = kernel_dims.width; - _kernel_height = kernel_dims.height; - _dilation = dilation; - _convolved_dims = scaled_dimensions(src->dimension(width_idx), dst->dimension(height_idx), - _kernel_width, _kernel_height, - _conv_info, _dilation); - _has_bias = has_bias; - - if(_data_layout == DataLayout::NCHW) - { - switch(src->data_type()) - { - case DataType::F32: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; - break; -#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) - case DataType::BFLOAT16: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; - break; -#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::QASYMM8_SIGNED: - case DataType::QASYMM8: - _func = (!conv_info.has_padding()) ? 
&CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; - break; - default: - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - } - else - { - switch(src->data_type()) - { - case DataType::F32: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; - break; -#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) - case DataType::BFLOAT16: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; - break; -#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::QASYMM8: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; - break; - case DataType::QASYMM8_SIGNED: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col : &CpuIm2ColKernel::run_im2col; - break; - default: - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - } - - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, false))); - - std::pair convolved_dims = scaled_dimensions(src->dimension(width_idx), src->dimension(height_idx), - kernel_dims.width, kernel_dims.height, - conv_info, dilation); - - Window win = calculate_max_window(*src, Steps()); - win.set(width_idx, Window::Dimension(0, convolved_dims.first, 1)); - win.set(height_idx, Window::Dimension(0, convolved_dims.second, 1)); - win.set(channel_idx, Window::Dimension(0, 1, 1)); - // Configure kernel window - ICpuKernel::configure(win); -} - -Status CpuIm2ColKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation, unsigned int num_groups) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups)); - return Status{}; -} - -void CpuIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - (this->*_func)(src, dst, window); -} -const char *CpuIm2ColKernel::name() const -{ - return "CpuIm2ColKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/CpuIm2ColKernel.h b/src/core/cpu/kernels/CpuIm2ColKernel.h deleted file mode 100644 index ffac5077b2..0000000000 --- a/src/core/cpu/kernels/CpuIm2ColKernel.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
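For the NCHW path implemented above, the whole im2col transform amounts to gathering one zero-padded kernel_h x kernel_w patch per input channel into each output row, then appending a trailing 1 when the convolution has a bias (the "Append 1" step in linearize_volume_nchw). A minimal scalar sketch without dilation or groups; parameter names are illustrative only:

```cpp
#include <vector>

// Scalar NCHW im2col reference: 'src' is laid out [channels][height][width],
// one output row is produced per (oy, ox) position of the convolution.
std::vector<float> im2col_nchw(const std::vector<float> &src,
                               int channels, int height, int width,
                               int kernel_h, int kernel_w,
                               int stride_y, int stride_x,
                               int pad_y, int pad_x,
                               bool has_bias)
{
    const int out_h   = (height + 2 * pad_y - kernel_h) / stride_y + 1;
    const int out_w   = (width + 2 * pad_x - kernel_w) / stride_x + 1;
    const int patch   = channels * kernel_h * kernel_w;
    const int row_len = patch + (has_bias ? 1 : 0);

    std::vector<float> dst(static_cast<size_t>(out_h) * out_w * row_len, 0.f);

    for(int oy = 0; oy < out_h; ++oy)
    {
        for(int ox = 0; ox < out_w; ++ox)
        {
            float *out_row = dst.data() + (static_cast<size_t>(oy) * out_w + ox) * row_len;
            int    idx     = 0;
            for(int c = 0; c < channels; ++c)
            {
                for(int ky = 0; ky < kernel_h; ++ky)
                {
                    for(int kx = 0; kx < kernel_w; ++kx, ++idx)
                    {
                        const int y = oy * stride_y - pad_y + ky;
                        const int x = ox * stride_x - pad_x + kx;
                        if(y >= 0 && y < height && x >= 0 && x < width)
                        {
                            out_row[idx] = src[(static_cast<size_t>(c) * height + y) * width + x];
                        }
                        // Out-of-bounds samples keep the pad value (0 for float data).
                    }
                }
            }
            if(has_bias)
            {
                out_row[patch] = 1.f; // bias column
            }
        }
    }
    return dst;
}
```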
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_IM2COL_KERNEL_H -#define ARM_COMPUTE_CPU_IM2COL_KERNEL_H - -#include "arm_compute/core/Size2D.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -class ITensor; -namespace cpu -{ -namespace kernels -{ -/** Interface for the im2col reshape kernel. - * - * Rearranges image blocks into columns. It is used to strip out each convolution block to a single column. - * It is used to transform a convolution to a plain matrix multiplication. - * - * For example taking into account the image below and assuming 3x3 image blocks with stride of 1 we have: - * - * @f[ - * \left( \begin{array}{cccc} - * a00 & a01 & a02 & a03 \\ - * a10 & a11 & a12 & a13 \\ - * a20 & a21 & a22 & a23 \\ - * a30 & a31 & a32 & a33 \\ - * \end{array} \right) - * \rightarrow - * \left( \begin{array}{ccccccccc} - * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\ - * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\ - * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\ - * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\ - * \end{array} \right) - * @f] - */ -class CpuIm2ColKernel : public ICpuKernel -{ -public: - /** Default constructor */ - CpuIm2ColKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuIm2ColKernel); - /** Set the input and output of the kernel. - * - * @param[in] src The input tensor info to convert. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32 - * Note: QASYMM8/QASYMM8_SIGNED works only for has_bias = false - * @param[out] dst The output tensor info. Data types supported: Same as @p input - * @param[in] kernel_dims The kernel dimensions (width and height). - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] has_bias In case biases are provided expands the matrix with 1. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is not supported - */ - void configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuIm2ColKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - /** Template function to run im2col - * - * @param[in] src The input tensor info - * @param[out] dst The output tensor info - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - */ - template - void run_im2col(const ITensor *src, ITensor *dst, const Window &window); - - /** Common signature for all the specialised im2col functions - * - * @param[in] window Region on which to execute the kernel. - */ - using Im2ColFunctionPtr = void (CpuIm2ColKernel::*)(const ITensor *src, ITensor *dst, const Window &window); - - Im2ColFunctionPtr _func{ nullptr }; - std::pair _convolved_dims{}; - PadStrideInfo _conv_info{}; - unsigned int _kernel_width{ 0 }; - unsigned int _kernel_height{ 0 }; - bool _has_bias{ false }; - Size2D _dilation{ 1U, 1U }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /*ARM_COMPUTE_CPU_IM2COL_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuMulKernel.cpp b/src/core/cpu/kernels/CpuMulKernel.cpp deleted file mode 100644 index 82ec322875..0000000000 --- a/src/core/cpu/kernels/CpuMulKernel.cpp +++ /dev/null @@ -1,1729 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuMulKernel.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NESymm.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -const float scale255_constant = 1.f / 255.f; -const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant); -const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f); - -inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) -{ - ARM_COMPUTE_UNUSED(overflow_policy); - ARM_COMPUTE_UNUSED(rounding_policy); - - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, - DataType::S32, DataType::F16, DataType::F32); - if(is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized"); - } - - if(dst->total_size() > 0) - { - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - // clang-format off - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - !(src1->data_type() == src2->data_type() && src2->data_type() == dst->data_type()) && - !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) && - !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::S16 && dst->data_type() == DataType::S16) && - !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) && - !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) && - !(src1->data_type() == DataType::QSYMM16 && src2->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32) - , "Invalid data type combination"); - // clang-format on - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S16 && dst->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 dst"); - } - - if(std::abs(scale - scale255_constant) < 0.00001f) - { - ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 && 
dst->data_type() == DataType::S32, - "Scale == 1/255 is not supported if input and dst are of data type S32"); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO); - - int exponent = 0; - const float normalized_mantissa = std::frexp(scale, &exponent); - - // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15 - // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14 - // Moreover, it will be negative as we deal with 1/2^n - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255"); - } - - return Status{}; -} - -/* Scales a given vector by 1/255. - * - * @note This does not work for all cases. e.g. for float of 0.49999999999999994 and large floats. - * - * @param in Input vector to scale. - * @return Scaled dst rounded to nearest (round half up). - */ -inline int32x4_t scale255_S32_S32(int32x4_t in) -{ - // Scale - const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q); - // Round to nearest (round half up) - // Add +0.5 for all values - // Afterwards vcvt rounds toward zero - return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q)); -} - -inline uint16x8_t scale255_U16_U16(uint16x8_t in) -{ - const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in)))); - const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in)))); - return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1))); -} - -template -inline typename std::enable_if::value, int8x16_t>::type -vquantize(float32x4x4_t val, const UniformQuantizationInfo &info) -{ - return vquantize_signed(val, info); -} - -template -inline typename std::enable_if::value, uint8x16_t>::type -vquantize(float32x4x4_t val, const UniformQuantizationInfo &info) -{ - return vquantize(val, info); -} - -template -void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) -{ - // Create input windows - Window win = window; - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); - - const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform(); - const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset }; - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src2 : src1; - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator dst(out, win); - - using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); - - // Dequantize inputs - const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo); - const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo); - - const float32x4x4_t out_f32x4x4 = - { - vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), - vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), - vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), - vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), - }; - - // Quantize dst - const auto result = vquantize(out_f32x4x4, tmp_qua_info); - wrapper::vstore(output_ptr + x, result); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Dequantize inputs - const T src1 = *(non_broadcast_input_ptr + x); - const float tmp_in1 = Qasymm8QuantizationHelper::dequantize(src1, non_broadcast_qinfo); - const float tmp_in2 = Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo); - const float tmp_f = tmp_in1 * tmp_in2; - - // Quantize dst - const auto tmp_qua = Qasymm8QuantizationHelper::quantize(tmp_f, tmp_qua_info); - *(output_ptr + x) = tmp_qua; - } - }, - broadcast_input, non_broadcast_input, dst); - } - else - { - const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform(); - const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform(); - - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto input1_q = wrapper::vloadq(input1_ptr + x); - const auto input2_q = wrapper::vloadq(input2_ptr + x); - - // Dequantize inputs - const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info); - const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info); - - const float32x4x4_t out_f32x4x4 = - { - 
vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), - vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), - vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), - vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), - }; - - // Quantize dst - const auto result = vquantize(out_f32x4x4, tmp_qua_info); - wrapper::vstore(output_ptr + x, result); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Dequantize inputs - const T src1 = *(input1_ptr + x); - const T src2 = *(input2_ptr + x); - const float tmp_in1 = Qasymm8QuantizationHelper::dequantize(src1, input1_qua_info); - const float tmp_in2 = Qasymm8QuantizationHelper::dequantize(src2, input2_qua_info); - const float tmp_f = tmp_in1 * tmp_in2; - - // Quantize dst - const auto tmp_qua = Qasymm8QuantizationHelper::quantize(tmp_f, tmp_qua_info); - *(output_ptr + x) = tmp_qua; - } - }, - input1, input2, dst); - } -} - -void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) -{ - const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform(); - const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform(); - const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform(); - - // Create input windows - Window win = window; - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset }; - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const qsymm16x8x2_t input1_q = - { - { - vld1q_s16(input1_ptr + x), - vld1q_s16(input1_ptr + x + 8), - } - }; - const qsymm16x8x2_t input2_q = - { - { - vld1q_s16(input2_ptr + x), - vld1q_s16(input2_ptr + x + 8), - } - }; - - // Dequantize inputs - const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info); - const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info); - - const float32x4x4_t out_f32x4x4 = - { - vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), - vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), - vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), - vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), - }; - - const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info); - vst1q_s16(output_ptr + x, result.val[0]); - vst1q_s16(output_ptr + x + 8, result.val[1]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Dequantize inputs - float tmp_in1 = static_cast(*(input1_ptr + x)) * input1_qua_info.scale; - float tmp_in2 = 
static_cast(*(input2_ptr + x)) * input2_qua_info.scale; - float tmp_f = tmp_in1 * tmp_in2; - - // Quantize dst, lrintf() has same rounding mode as vcombine_s16 - int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale); - qsymm16_t tmp_qua = static_cast(tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp); - *(output_ptr + x) = tmp_qua; - } - }, - input1, input2, dst); -} - -void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale) -{ - ARM_COMPUTE_UNUSED(scale); - - // Create input windows - Window win = window; - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const qsymm16x8x2_t input1_q = - { - { - vld1q_s16(input1_ptr + x), - vld1q_s16(input1_ptr + x + 8), - } - }; - const qsymm16x8x2_t input2_q = - { - { - vld1q_s16(input2_ptr + x), - vld1q_s16(input2_ptr + x + 8), - } - }; - - const int32x4x4_t in1_s32 = - { - { - vmovl_s16(vget_low_s16(input1_q.val[0])), - vmovl_s16(vget_high_s16(input1_q.val[0])), - vmovl_s16(vget_low_s16(input1_q.val[1])), - vmovl_s16(vget_high_s16(input1_q.val[1])), - } - }; - const int32x4x4_t in2_s32 = - { - { - vmovl_s16(vget_low_s16(input2_q.val[0])), - vmovl_s16(vget_high_s16(input2_q.val[0])), - vmovl_s16(vget_low_s16(input2_q.val[1])), - vmovl_s16(vget_high_s16(input2_q.val[1])), - } - }; - - const int32x4x4_t result = - { - { - vmulq_s32(in1_s32.val[0], in2_s32.val[0]), - vmulq_s32(in1_s32.val[1], in2_s32.val[1]), - vmulq_s32(in1_s32.val[2], in2_s32.val[2]), - vmulq_s32(in1_s32.val[3], in2_s32.val[3]), - } - }; - - vst1q_s32(output_ptr + x, result.val[0]); - vst1q_s32(output_ptr + x + 4, result.val[1]); - vst1q_s32(output_ptr + x + 8, result.val[2]); - vst1q_s32(output_ptr + x + 12, result.val[3]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - *(output_ptr + x) = tmp; - } - }, - input1, input2, dst); -} - -template -void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) -{ - // Create input windows - Window win = window; - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator 
input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - const int window_step_x = 16 / sizeof(uint8_t); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x); - const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x); - - uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1)); - const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2)); - uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1)); - const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2)); - - tmp1_high = vmulq_u16(tmp1_high, tmp2_high); - tmp1_low = vmulq_u16(tmp1_low, tmp2_low); - - if(is_scale255) - { - tmp1_high = scale255_U16_U16(tmp1_high); - tmp1_low = scale255_U16_U16(tmp1_low); - } - else - { - const int16x8_t vn = vdupq_n_s16(-n); - - if(is_sat) - { - tmp1_high = vqshlq_u16(tmp1_high, vn); - tmp1_low = vqshlq_u16(tmp1_low, vn); - } - else - { - tmp1_high = vshlq_u16(tmp1_high, vn); - tmp1_low = vshlq_u16(tmp1_low, vn); - } - } - if(is_sat) - { - vst1q_u8(output_ptr, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high))); - } - else - { - vst1q_u8(output_ptr, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high))); - } - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - uint16_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - - if(is_scale255) - { - float tmp_f = static_cast(tmp) * scale255_constant; - tmp = static_cast(tmp_f + 0.5f); - } - else - { - tmp >>= n; - } - if(is_sat && tmp > 255) - { - tmp = 255; - } - *(output_ptr + x) = static_cast(tmp); - } - }, - input1, input2, dst); -} - -template -inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &src2, int n) -{ - int32x4_t tmp1_high = vmovl_s16(vget_high_s16(src1)); - const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(src2)); - int32x4_t tmp1_low = vmovl_s16(vget_low_s16(src1)); - const int32x4_t tmp2_low = vmovl_s16(vget_low_s16(src2)); - - tmp1_high = vmulq_s32(tmp1_high, tmp2_high); - tmp1_low = vmulq_s32(tmp1_low, tmp2_low); - - if(is_scale255) - { - tmp1_high = scale255_S32_S32(tmp1_high); - tmp1_low = scale255_S32_S32(tmp1_low); - } - else - { - // Right shift amount - const int32x4_t vn = vdupq_n_s32(-n); - // Left shift amount - const int32x4_t vnl = vdupq_n_s32(n); - // Calculate conversion bit - const uint32x4_t tmp1_high_u = vreinterpretq_u32_s32(tmp1_high); - const uint32x4_t tmp1_low_u = vreinterpretq_u32_s32(tmp1_low); - const uint32x4_t sign_high = vshrq_n_u32(tmp1_high_u, 31); - const uint32x4_t sign_low = vshrq_n_u32(tmp1_low_u, 31); - const int32x4_t sign_high_s = vreinterpretq_s32_u32(sign_high); - const int32x4_t sign_low_s = vreinterpretq_s32_u32(sign_low); - const int32x4_t convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s); - const int32x4_t convert_low = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s); - if(is_sat) - { - tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn); - tmp1_low = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn); - } - else - { - tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn); - 
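// Illustration of the "conversion bit" trick used just above (sketch only, the
// helper name below is hypothetical): for a negative product v and shift n, a plain
// arithmetic right shift (v >> n) rounds toward negative infinity; adding (2^n - 1)
// beforehand makes the shift truncate toward zero, which is what the sign/convert
// vectors emulate lane-wise.
//
//   int32_t shift_round_to_zero(int32_t v, int n)
//   {
//       const int32_t bias = (v < 0) ? ((1 << n) - 1) : 0; // conversion bits
//       return (v + bias) >> n;
//   }
//
// e.g. v = -7, n = 1: (-7 + 1) >> 1 = -3 (toward zero), whereas -7 >> 1 = -4.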
tmp1_low = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn); - } - } - - if(is_sat) - { - return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high)); - } - else - { - return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high)); - } -} - -template -inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n) -{ - const int16x8x2_t result = - { - { - // First 8 elements - mul_S16_S16_S16_n_loop(src1.val[0], src2.val[0], n), - // Second 8 elements - mul_S16_S16_S16_n_loop(src1.val[1], src2.val[1], n) - } - }; - - return result; -} - -template -void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) -{ - // Create input windows - Window win = window; - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8x2_t ta1 = - { - { - vld1q_s16(input1_ptr + x), - vld1q_s16(input1_ptr + x + 8), - } - }; - const int16x8x2_t ta2 = - { - { - vld1q_s16(input2_ptr + x), - vld1q_s16(input2_ptr + x + 8), - } - }; - const int16x8x2_t result = mul_S16_S16_S16_n_k(ta1, ta2, n); - - vst1q_s16(output_ptr + x, result.val[0]); - vst1q_s16(output_ptr + x + 8, result.val[1]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - - if(is_scale255) - { - float tmp_f = static_cast(tmp) * scale255_constant; - - tmp = static_cast(tmp_f + 0.5f); - } - else - { - if(tmp >= 0) - { - tmp >>= n; - } - else - { - uint32_t mask = (1u << n) - 1; - tmp = (tmp + static_cast(mask)) >> n; - } - } - if(is_sat) - { - tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? 
SHRT_MIN : tmp); - } - *(output_ptr + x) = static_cast(tmp); - } - }, - input1, input2, dst); -} - -template -inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t &src2, int n) -{ - const int32x2_t input1_1 = vget_low_s32(src1); - const int32x2_t input2_1 = vget_low_s32(src2); - const int32x2_t input1_2 = vget_high_s32(src1); - const int32x2_t input2_2 = vget_high_s32(src2); - - int64x2_t tmp_1 = vmull_s32(input1_1, input2_1); - int64x2_t tmp_2 = vmull_s32(input1_2, input2_2); - - // Apply scaling, conversion and rounding (round to zero) - // Right shift amount - const int64x2_t vn = vdupq_n_s64(-n); - // Left shift amount - const int64x2_t vnl = vdupq_n_s64(n); - // Calculate conversion bit - const uint64x2_t tmp_1_u = vreinterpretq_u64_s64(tmp_1); - const uint64x2_t sign_1 = vshrq_n_u64(tmp_1_u, 63); - const int64x2_t sign_1_s = vreinterpretq_s64_u64(sign_1); - const int64x2_t convert_1 = vsubq_s64(vshlq_s64(sign_1_s, vnl), sign_1_s); - - const uint64x2_t tmp_2_u = vreinterpretq_u64_s64(tmp_2); - const uint64x2_t sign_2 = vshrq_n_u64(tmp_2_u, 63); - const int64x2_t sign_2_s = vreinterpretq_s64_u64(sign_2); - const int64x2_t convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s); - if(is_sat) - { - tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn); - tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn); - return vcombine_s32(vqmovn_s64(tmp_1), vqmovn_s64(tmp_2)); - } - else - { - tmp_1 = vshlq_s64(vaddq_s64(tmp_1, convert_1), vn); - tmp_2 = vshlq_s64(vaddq_s64(tmp_2, convert_2), vn); - return vcombine_s32(vmovn_s64(tmp_1), vmovn_s64(tmp_2)); - } -} - -template -inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n) -{ - const int32x4x2_t result = - { - { - // First 4 elements - mul_S32_S32_S32_n_loop(src1.val[0], src2.val[0], n), - // Second 4 elements - mul_S32_S32_S32_n_loop(src1.val[1], src2.val[1], n) - } - }; - - return result; -} - -template -void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 8; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src2 : src1; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator dst(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - const int32_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = vdupq_n_s32(broadcast_value); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int32x4x2_t broadcast_v = - { - { - broadcast_value_vec, - broadcast_value_vec, - } - }; - const int32x4x2_t non_broadcast_v = - { - { - vld1q_s32(non_broadcast_input_ptr + x), - vld1q_s32(non_broadcast_input_ptr + x + 4), - } - }; - const int32x4x2_t result = mul_S32_S32_S32_n_k(broadcast_v, non_broadcast_v, n); - - vst1q_s32(output_ptr + x, result.val[0]); - vst1q_s32(output_ptr + x + 4, result.val[1]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int64_t tmp = static_cast(broadcast_value) * static_cast(*(non_broadcast_input_ptr + x)); - - if(tmp >= 0) - { - tmp >>= n; - } - else - { - uint64_t mask = ((uint64_t)1u << n) - 1; - tmp = (tmp + static_cast(mask)) >> n; - } - if(is_sat) - { - tmp = utility::clamp(tmp); - } - *(output_ptr + x) = static_cast(tmp); - } - }, - broadcast_input, non_broadcast_input, dst); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int32x4x2_t ta1 = - { - { - vld1q_s32(input1_ptr + x), - vld1q_s32(input1_ptr + x + 4), - } - }; - const int32x4x2_t ta2 = - { - { - vld1q_s32(input2_ptr + x), - vld1q_s32(input2_ptr + x + 4), - } - }; - const int32x4x2_t result = mul_S32_S32_S32_n_k(ta1, ta2, n); - - vst1q_s32(output_ptr + x, result.val[0]); - vst1q_s32(output_ptr + x + 4, result.val[1]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int64_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - - if(tmp >= 0) - { - tmp >>= n; - } - else - { - uint64_t mask = ((uint64_t)1u << n) - 1; - tmp = (tmp + static_cast(mask)) >> n; - } - if(is_sat) - { - tmp = utility::clamp(tmp); - } - *(output_ptr + x) = static_cast(tmp); - } - }, - input1, input2, dst); - } -} - -void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - 
win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - constexpr int window_step_x = 16 / sizeof(float); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); - - using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator dst(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - const float broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); - auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; - } - }, - broadcast_input, non_broadcast_input, dst); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto ta1 = wrapper::vloadq(input1_ptr + x); - const auto ta2 = wrapper::vloadq(input2_ptr + x); - const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); - const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto ta1 = *(input1_ptr + x); - const auto ta2 = *(input2_ptr + x); - *(output_ptr + x) = ta1 * ta2 * scale; - } - }, - input1, input2, dst); - } -} - -void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - 
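// Note on the complex path (illustrative sketch, not a separate helper in this file):
// the 2-channel F32 tensors are treated as interleaved complex values (re, im), and
// each output element is the complex product that the NEON code below vectorises.
// Scalar form, for a = (a0, a1) and b = (b0, b1):
//
//   float c0 = a0 * b0 - a1 * b1; // real part
//   float c1 = a0 * b1 + a1 * b0; // imaginary part
//
// This matches the scalar left-over loop of the non-broadcast path further down.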
Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - constexpr int window_step_x = 8 / sizeof(float); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); - - using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator dst(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - const float broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x); - float32x4_t b = vdupq_n_f32(broadcast_value); - - const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f }; - const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); - const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); - const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); - const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); - - const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); - const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); - - float32x4_t res = wrapper::vmul(tmp0, b); - b = wrapper::vmul(b, mask); - - res = wrapper::vmla(res, tmp1, b); - wrapper::vstore(output_ptr + 2 * x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x); - const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1); - auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1); - auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0); - *(output_ptr + 2 * x) = res1; - *(output_ptr + 2 * x + 1) = res2; - } - }, - broadcast_input, non_broadcast_input, dst); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute 
window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x); - float32x4_t b = wrapper::vloadq(input2_ptr + 2 * x); - - const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f }; - const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); - const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); - const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); - const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); - - const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); - const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); - - float32x4_t res = wrapper::vmul(tmp0, b); - - b = wrapper::vrev64(b); - b = wrapper::vmul(b, mask); - - res = wrapper::vmla(res, tmp1, b); - wrapper::vstore(output_ptr + 2 * x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto a0 = *(input1_ptr + 2 * x); - const auto a1 = *(input1_ptr + 2 * x + 1); - const auto b0 = *(input2_ptr + 2 * x); - const auto b1 = *(input2_ptr + 2 * x + 1); - auto res1 = a0 * b0 - a1 * b1; - auto res2 = a0 * b1 + a1 * b0; - *(output_ptr + 2 * x) = res1; - *(output_ptr + 2 * x + 1) = res2; - } - }, - input1, input2, dst); - } -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - constexpr int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src2 : src1; - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator dst(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float16x8x2_t broadcast_value_vec = - { - { - vdupq_n_f16(broadcast_value), - vdupq_n_f16(broadcast_value), - } - }; - const auto scale_vec = vdupq_n_f16(scale); - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float16x8x2_t non_broadcast_v = - { - { - vld1q_f16(non_broadcast_input_ptr + x), - vld1q_f16(non_broadcast_input_ptr + x + 8), - } - }; - const float16x8x2_t result = - { - { - vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec), - vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec), - } - }; - vst1q_f16(output_ptr + x, result.val[0]); - vst1q_f16(output_ptr + x + 8, result.val[1]); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; - } - }, - broadcast_input, non_broadcast_input, dst); - } - else - { - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float16x8x2_t ta1 = - { - { - vld1q_f16(input1_ptr + x), - vld1q_f16(input1_ptr + x + 8), - } - }; - const float16x8x2_t ta2 = - { - { - vld1q_f16(input2_ptr + x), - vld1q_f16(input2_ptr + x + 8), - } - }; - const float16x8_t scale_vec = vdupq_n_f16(scale); - const float16x8x2_t result = - { - { - vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec), - vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec), - } - }; - vst1q_f16(output_ptr + x, result.val[0]); - vst1q_f16(output_ptr + x + 8, result.val[1]); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto ta1 = *(input1_ptr + x); - const auto ta2 = *(input2_ptr + x); - *(output_ptr + x) = ta1 * ta2 * scale; - } - }, - input1, input2, dst); - } -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -template -void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) -{ - // Create input windows - Window win = window; - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - 
input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - const int window_step_x = 16 / sizeof(uint8_t); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t bv = wrapper::vloadq(input2_ptr + x); - const uint8x16_t av = wrapper::vloadq(input1_ptr + x); - - uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av)); - uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av)); - tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv))); - tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv))); - - if(is_scale255) - { - tmp_low = scale255_U16_U16(tmp_low); - tmp_high = scale255_U16_U16(tmp_high); - } - else - { - const int16x8_t vn = vdupq_n_s16(-n); - - if(is_sat) - { - tmp_low = vqshlq_u16(tmp_low, vn); - tmp_high = vqshlq_u16(tmp_high, vn); - } - else - { - tmp_low = vshlq_u16(tmp_low, vn); - tmp_high = vshlq_u16(tmp_high, vn); - } - } - - if(is_sat) - { - static const uint16x8_t max = vdupq_n_u16(SHRT_MAX); - - tmp_low = vminq_u16(tmp_low, max); - tmp_high = vminq_u16(tmp_high, max); - } - - vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low)); - vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - - if(is_scale255) - { - float tmp_f = static_cast(tmp) * scale255_constant; - tmp = static_cast(tmp_f + 0.5f); - } - else - { - tmp >>= n; - } - - if(is_sat) - { - tmp = (tmp > SHRT_MAX) ? 
SHRT_MAX : tmp; - } - - *(output_ptr + x) = static_cast(tmp); - } - }, - input1, input2, dst); -} - -template -void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) -{ - // Create input windows - Window win = window; - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8x2_t ta1 = - { - { - vld1q_s16(input1_ptr + x), - vld1q_s16(input1_ptr + x + 8), - } - }; - const uint8x8x2_t ta2u = - { - { - vld1_u8(input2_ptr + x), - vld1_u8(input2_ptr + x + 8), - } - }; - const int16x8x2_t ta2 = - { - { - vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])), - vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1])) - } - }; - - const int16x8x2_t result = mul_S16_S16_S16_n_k(ta1, ta2, n); - - vst1q_s16(output_ptr + x, result.val[0]); - vst1q_s16(output_ptr + x + 8, result.val[1]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - - if(is_scale255) - { - float tmp_f = static_cast(tmp) * scale255_constant; - - tmp = static_cast(tmp_f + 0.5f); - } - else - { - if(tmp >= 0) - { - tmp >>= n; - } - else - { - uint32_t mask = (1u << n) - 1; - tmp = (tmp + static_cast(mask)) >> n; - } - } - if(is_sat) - { - tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? 
SHRT_MIN : tmp); - } - *(output_ptr + x) = static_cast(tmp); - } - }, - input1, input2, dst); -} - -template -void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) -{ - // Simply swap the two input buffers - mul_S16_U8_S16(src2, src1, out, window, n); -} -} // namespace - -void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) -{ - ARM_COMPUTE_UNUSED(rounding_policy); - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy)); - - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - - // Auto initialize dst if not initialized - set_shape_if_empty(*dst, out_shape); - - _scale = scale; - _scale_exponent = 0; - _func_quantized = nullptr; - _func_int = nullptr; - _func_float = nullptr; - - bool is_scale_255 = false; - // Check and validate scaling factor - if(std::abs(scale - scale255_constant) < 0.00001f) - { - is_scale_255 = true; - } - else - { - int exponent = 0; - - std::frexp(scale, &exponent); - - // Store the positive exponent. We know that we compute 1/2^n - // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5 - _scale_exponent = std::abs(exponent - 1); - } - - const DataType dt_input1 = src1->data_type(); - const DataType dt_input2 = src2->data_type(); - const DataType dt_output = dst->data_type(); - const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE); - - switch(dt_input1) - { - case DataType::QASYMM8: - if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8) - { - _func_quantized = &mul_saturate_quantized_8; - } - break; - case DataType::QASYMM8_SIGNED: - if(dt_input2 == DataType::QASYMM8_SIGNED) - { - _func_quantized = &mul_saturate_quantized_8; - ; - } - break; - case DataType::QSYMM16: - if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16) - { - _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16; - } - else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32) - { - _func_int = &mul_QSYMM16_QSYMM16_S32; - } - break; - case DataType::S16: - if(DataType::U8 == dt_input2 && DataType::S16 == dt_output) - { - if(is_scale_255) - { - _func_int = is_sat ? &mul_S16_U8_S16 : &mul_S16_U8_S16; - } - else - { - _func_int = is_sat ? &mul_S16_U8_S16 : &mul_S16_U8_S16; - } - } - if(DataType::S16 == dt_input2 && DataType::S16 == dt_output) - { - if(is_scale_255) - { - _func_int = is_sat ? &mul_S16_S16_S16 : &mul_S16_S16_S16; - } - else - { - _func_int = is_sat ? &mul_S16_S16_S16 : &mul_S16_S16_S16; - } - } - break; - case DataType::S32: - if(DataType::S32 == dt_input2 && DataType::S32 == dt_output) - { - _func_int = is_sat ? &mul_S32_S32_S32 : &mul_S32_S32_S32; - } - break; - case DataType::U8: - if(DataType::U8 == dt_input2 && DataType::U8 == dt_output) - { - if(is_scale_255) - { - _func_int = is_sat ? &mul_U8_U8_U8 : &mul_U8_U8_U8; - } - else - { - _func_int = is_sat ? &mul_U8_U8_U8 : &mul_U8_U8_U8; - } - } - else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output) - { - if(is_scale_255) - { - _func_int = is_sat ? &mul_U8_U8_S16 : &mul_U8_U8_S16; - } - else - { - _func_int = is_sat ? &mul_U8_U8_S16 : &mul_U8_U8_S16; - } - } - else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output) - { - if(is_scale_255) - { - _func_int = is_sat ? 
&mul_U8_S16_S16 : &mul_U8_S16_S16; - } - else - { - _func_int = is_sat ? &mul_U8_S16_S16 : &mul_U8_S16_S16; - } - } - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - _func_float = &mul_F16_F16_F16; - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - _func_float = &mul_F32_F32_F32; - break; - default: - ARM_COMPUTE_ERROR("You called with the wrong img formats"); - } - - // Configure kernel window - Window win = calculate_max_window(out_shape); - - ICpuKernel::configure(win); -} - -Status CpuMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, - RoundingPolicy rounding_policy) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy)); - - return Status{}; -} - -void CpuMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - if(_func_quantized != nullptr) - { - (*_func_quantized)(src1, src2, dst, window, _scale); - } - else if(_func_int != nullptr) - { - (*_func_int)(src1, src2, dst, window, _scale_exponent); - } - else - { - ARM_COMPUTE_ERROR_ON(_func_float == nullptr); - (*_func_float)(src1, src2, dst, window, _scale); - } -} -const char *CpuMulKernel::name() const -{ - return "CpuMulKernel"; -} -namespace -{ -Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F32); - - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - - // Validate in case of configured dst - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); - } - - return Status{}; -} -} // namespace - -void CpuComplexMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst)); - - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - - // Auto initialize dst if not initialized - const TensorInfo out_info(out_shape, src1->num_channels(), src1->data_type()); - auto_init_if_empty(*dst, out_info); - - // Configure kernel window - Window win = calculate_max_window(out_shape); - - ICpuKernel::configure(win); -} - -Status CpuComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst)); - - return Status{}; -} - -void CpuComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - 
ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - c_mul_F32_F32_F32_n(src1, src2, dst, window); -} - -const char *CpuComplexMulKernel::name() const -{ - return "CpuComplexMulKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuMulKernel.h b/src/core/cpu/kernels/CpuMulKernel.h deleted file mode 100644 index 3ea176cc31..0000000000 --- a/src/core/cpu/kernels/CpuMulKernel.h +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_MUL_KERNEL_H -#define ARM_COMPUTE_CPU_MUL_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the kernel to perform multiplication between two tensors */ -class CpuMulKernel : public ICpuKernel -{ -public: - CpuMulKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuMulKernel); - /** Initialise the kernel's input, dst and border mode. - * - * Valid configurations (Src1,Src2) -> Dst : - * - * Support: Broadcast? Scale=1/255? - * - (U8,U8) -> U8, S16 N Y - * - (U8,S16) -> S16 N Y - * - (S16,U8) -> S16 N Y - * - (S16,S16) -> S16 N Y - * - (S32,S32) -> S32 Y N - * - (F16,F16) -> F16 N Y - * - (F32,F32) -> F32 Y Y - * - (QASYMM8,QASYMM8) -> QASYMM8 Y Y - * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED Y Y - * - (QSYMM16,QSYMM16) -> QSYMM16, S32 N Y - * - * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. - * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. - * - * @param[in] src1 First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 - * @param[in] src2 Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 - * @param[out] dst Dst tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 - * @param[in] scale Scale to apply after multiplication. 
- * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. - * If both @p src1, @p src2 and @p dst are of datatype S32, scale cannot be 1/255 - * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype - * @param[in] rounding_policy Rounding policy. - */ - void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuMulKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); - - // Inherited methods overridden - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - /** Common signature for all the specialised multiplication functions with integer scaling factor - * - * @param[in] src1 Src1 tensor object. - * @param[in] src2 Src2 tensor object. - * @param[out] dst Dst tensor object. - * @param[in] window Region on which to execute the kernel - * @param[in] scale Integer scale factor. - */ - using MulFunctionInt = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, int scale); - /** Common signature for all the specialised multiplication functions with float scaling factor - * - * @param[in] src1 Src1 tensor object. - * @param[in] src2 Src2 tensor object. - * @param[out] dst Dst tensor object. - * @param[in] window Region on which to execute the kernel - * @param[in] scale Float scale factor. - */ - using MulFunctionFloat = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale); - /** Common signature for all the specialised QASYMM8 multiplication functions with float scaling factor - * - * @param[in] src1 Src1 tensor object. - * @param[in] src2 Src2 tensor object. - * @param[out] dst Dst tensor object. - * @param[in] window Region on which to execute the kernel - * @param[in] scale Float scale factor. - * - */ - using MulFunctionQuantized = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale); - - MulFunctionFloat *_func_float{ nullptr }; - MulFunctionInt *_func_int{ nullptr }; - MulFunctionQuantized *_func_quantized{ nullptr }; - float _scale{ 0 }; - int _scale_exponent{ 0 }; -}; - -/** Interface for the complex pixelwise multiplication kernel. */ -class CpuComplexMulKernel : public ICpuKernel -{ -public: - CpuComplexMulKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuComplexMulKernel); - /** Initialise the kernel's src, dst and border mode. - * - * @param[in] src1 An src tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor). - * @param[in] src2 An src tensor. Data types supported: same as @p src1. Number of channels supported: same as @p src1. - * @param[out] dst The dst tensor, Data types supported: same as @p src1. Number of channels supported: same as @p src1. 
- */ - void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuComplexMulKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_MUL_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuPermuteKernel.cpp b/src/core/cpu/kernels/CpuPermuteKernel.cpp deleted file mode 100644 index 270d6e222e..0000000000 --- a/src/core/cpu/kernels/CpuPermuteKernel.cpp +++ /dev/null @@ -1,301 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
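The header above follows the stateless-kernel pattern used throughout these files: configure()/validate() operate on ITensorInfo, while run_op() receives the actual buffers through an ITensorPack at execution time. A usage sketch under that assumption; include paths are the pre-move ones deleted by this patch, and the shapes/scale are hypothetical:

    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/experimental/Types.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/core/cpu/kernels/CpuMulKernel.h"

    using namespace arm_compute;

    void multiply_f32()
    {
        const TensorInfo info(TensorShape(32U, 32U), 1, DataType::F32);
        Tensor a, b, out;
        a.allocator()->init(info);
        b.allocator()->init(info);
        out.allocator()->init(info);

        cpu::kernels::CpuMulKernel k;
        // scale = 1.f means no rescaling; only round-to-zero is supported for such scales.
        k.configure(a.info(), b.info(), out.info(), 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

        a.allocator()->allocate();
        b.allocator()->allocate();
        out.allocator()->allocate();
        // ... fill a and b ...

        ITensorPack pack;
        pack.add_const_tensor(TensorType::ACL_SRC_0, &a);
        pack.add_const_tensor(TensorType::ACL_SRC_1, &b);
        pack.add_tensor(TensorType::ACL_DST, &out);
        NEScheduler::get().schedule_op(&k, Window::DimY, k.window(), pack);
    }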
- */ -#include "src/core/cpu/kernels/CpuPermuteKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace -{ -#include "src/core/NEON/kernels/convolution/common/shims.hpp" -} // namespace - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -inline bool is_permutation_supported(const PermutationVector &v) -{ - static const std::array permutations2 = - { - { - PermutationVector(0U, 1U), - PermutationVector(1U, 0U), - } - }; - static const std::array permutations3 = - { - { - PermutationVector(2U, 0U, 1U), - PermutationVector(1U, 2U, 0U), - PermutationVector(0U, 1U, 2U), - PermutationVector(0U, 2U, 1U), - PermutationVector(1U, 0U, 2U), - PermutationVector(2U, 1U, 0U), - } - }; - static const std::array permutations4 = - { - { - PermutationVector(0U, 1U, 2U, 3U), - PermutationVector(1U, 0U, 2U, 3U), - PermutationVector(2U, 0U, 1U, 3U), - PermutationVector(0U, 2U, 1U, 3U), - PermutationVector(1U, 2U, 0U, 3U), - PermutationVector(2U, 1U, 0U, 3U), - PermutationVector(2U, 1U, 3U, 0U), - PermutationVector(1U, 2U, 3U, 0U), - PermutationVector(3U, 2U, 1U, 0U), - PermutationVector(2U, 3U, 1U, 0U), - PermutationVector(1U, 3U, 2U, 0U), - PermutationVector(3U, 1U, 2U, 0U), - PermutationVector(3U, 0U, 2U, 1U), - PermutationVector(0U, 3U, 2U, 1U), - PermutationVector(2U, 3U, 0U, 1U), - PermutationVector(3U, 2U, 0U, 1U), - PermutationVector(0U, 2U, 3U, 1U), - PermutationVector(2U, 0U, 3U, 1U), - PermutationVector(1U, 0U, 3U, 2U), - PermutationVector(0U, 1U, 3U, 2U), - PermutationVector(3U, 1U, 0U, 2U), - PermutationVector(1U, 3U, 0U, 2U), - PermutationVector(0U, 3U, 1U, 2U), - PermutationVector(3U, 0U, 1U, 2U) - } - }; - - return (permutations2.end() != std::find(permutations2.begin(), permutations2.end(), v)) || (permutations3.end() != std::find(permutations3.begin(), permutations3.end(), v)) - || (permutations4.end() != std::find(permutations4.begin(), permutations4.end(), v)); -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm) -{ - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_permutation_supported(perm), "PermutationVector not supported."); - - const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm); - - // Validate configured destination - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - - return Status{}; -} - -template -void run_permute(const Window &window, const ITensor *src, const ITensor *dst, const PermutationVector &perm) -{ - const DataLayout src_layout = src->info()->data_layout(); - - // Source window - Window window_src = window; - - // we only support these two configs in src/core/NEON/kernels/convolution/common/shims.hpp, for all others - // we have to fall back to C++ - if((src_layout == DataLayout::NCHW && perm == PermutationVector{ 2U, 0U, 1U }) || (src_layout == DataLayout::NHWC && perm == PermutationVector{ 1U, 2U, 0U })) - { - 
window_src.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start())); - window_src.set(Window::DimY, Window::Dimension(window.y().start(), window.y().end(), window.y().end() - window.y().start())); - window_src.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), window.z().end() - window.z().start())); - window_src.set(3, Window::Dimension(window[3].start(), window[3].end(), window[3].end() - window[3].start())); - } - - // Destination window - Window window_dst(window); - const Window::Dimension zero_window = Window::Dimension(0, 0, 0); - for(size_t d = 0; d <= dst->info()->num_dimensions(); ++d) - { - window_dst.set(d, zero_window); - } - - // Create iterators - Iterator src_it(src, window_src); - Iterator dst_it(dst, window_dst); - - int in_row_stride = 0; - int in_col_stride = 0; - int in_channel_stride = 0; - int in_batch_stride = 0; - int n_cols = 0; - int n_rows = 0; - int n_channels = 0; - int n_batches = 0; - - switch(src_layout) - { - case DataLayout::NCHW: - { - in_row_stride = src->info()->strides_in_bytes().y() / sizeof(T); - in_channel_stride = src->info()->strides_in_bytes().z() / sizeof(T); - in_batch_stride = src->info()->strides_in_bytes()[3] / sizeof(T); - n_cols = src->info()->tensor_shape().x(); - n_rows = window_src.y().step(); - n_channels = src->info()->tensor_shape().z(); - n_batches = src->info()->tensor_shape()[3]; - break; - } - case DataLayout::NHWC: - { - in_col_stride = src->info()->strides_in_bytes().y() / sizeof(T); - in_row_stride = src->info()->strides_in_bytes().z() / sizeof(T); - in_batch_stride = src->info()->strides_in_bytes()[3] / sizeof(T); - n_channels = src->info()->tensor_shape().x(); - n_cols = window_src.y().step(); - n_rows = src->info()->tensor_shape().z(); - n_batches = src->info()->tensor_shape()[3]; - break; - } - default: - { - ARM_COMPUTE_ERROR("Invalid source data layout."); - break; - } - } - - // CHW -> HWC - if(src_layout == DataLayout::NCHW && perm == PermutationVector{ 2U, 0U, 1U }) - { - const int out_channel_stride = dst->info()->strides_in_bytes().x() / sizeof(T); - const int out_col_stride = dst->info()->strides_in_bytes().y() / sizeof(T); - const int out_row_stride = dst->info()->strides_in_bytes().z() / sizeof(T); - const int out_batch_stride = dst->info()->strides_in_bytes()[3] / sizeof(T); - execute_window_loop(window_src, [&](const Coordinates & id) - { - const int idx = id[0] * out_col_stride + id[1] * out_row_stride + id[2] * out_channel_stride; - reorder::nchw_to_nhwc(reinterpret_cast(src_it.ptr()), reinterpret_cast(dst_it.ptr()) + idx, - n_batches, n_channels, n_rows, n_cols, - in_batch_stride, in_channel_stride, in_row_stride, - out_batch_stride, out_row_stride, out_col_stride); - }, - src_it, dst_it); - } - // HWC -> CHW - else if(src_layout == DataLayout::NHWC && perm == PermutationVector{ 1U, 2U, 0U }) - { - const int out_col_stride = dst->info()->strides_in_bytes().x() / sizeof(T); - const int out_row_stride = dst->info()->strides_in_bytes().y() / sizeof(T); - const int out_channel_stride = dst->info()->strides_in_bytes().z() / sizeof(T); - const int out_batch_stride = dst->info()->strides_in_bytes()[3] / sizeof(T); - execute_window_loop(window_src, [&](const Coordinates & id) - { - const int idx = id[0] * out_channel_stride + id[1] * out_col_stride + id[2] * out_row_stride; - reorder::nhwc_to_nchw(reinterpret_cast(src_it.ptr()), reinterpret_cast(dst_it.ptr()) + idx, - n_batches, n_rows, n_cols, n_channels, - in_batch_stride, 
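The NCHW -> NHWC fast path above hands the layout change to reorder::nchw_to_nhwc() with explicit strides. As a scalar reference of the same index mapping over contiguous buffers (illustrative only, not from the patched sources):

    #include <cstddef>

    // Plain NCHW -> NHWC reorder; dst[n][h][w][c] = src[n][c][h][w].
    void nchw_to_nhwc_ref(const float *src, float *dst, size_t n, size_t c, size_t h, size_t w)
    {
        for(size_t in = 0; in < n; ++in)
            for(size_t ic = 0; ic < c; ++ic)
                for(size_t ih = 0; ih < h; ++ih)
                    for(size_t iw = 0; iw < w; ++iw)
                    {
                        const size_t src_idx = ((in * c + ic) * h + ih) * w + iw; // NCHW linearisation
                        const size_t dst_idx = ((in * h + ih) * w + iw) * c + ic; // NHWC linearisation
                        dst[dst_idx] = src[src_idx];
                    }
    }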
in_row_stride, in_col_stride, - out_batch_stride, out_channel_stride, out_row_stride); - }, - src_it, dst_it); - } - else - { - // All other cases fall back to C++ - // Permute strides - Strides strides = dst->info()->strides_in_bytes(); - Strides perm_strides = strides; - permute_strides(perm_strides, perm); - const int perm_stride_3 = src->info()->num_dimensions() >= 4 ? perm_strides[3] : 0; - execute_window_loop(window, [&](const Coordinates & id) - { - const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_stride_3; - *(reinterpret_cast(dst_it.ptr() + idx)) = *(reinterpret_cast(src_it.ptr())); - }, - src_it, dst_it); - } -} -} // namespace - -void CpuPermuteKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm); - // Destination auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, perm)); - - _perm = perm; - - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - - // This kernel doesn't need padding so update_window_and_padding() can be skipped - - ICpuKernel::configure(win); -} - -Status CpuPermuteKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, perm)); - return Status{}; -} - -void CpuPermuteKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - switch(src->info()->element_size()) - { - case 1: - run_permute(window, src, dst, _perm); - break; - case 2: - run_permute(window, src, dst, _perm); - break; - case 4: - run_permute(window, src, dst, _perm); - break; - default: - ARM_COMPUTE_ERROR("Element size not supported"); - break; - } -} - -const char *CpuPermuteKernel::name() const -{ - return "CpuPermuteKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuPermuteKernel.h b/src/core/cpu/kernels/CpuPermuteKernel.h deleted file mode 100644 index 2955f38960..0000000000 --- a/src/core/cpu/kernels/CpuPermuteKernel.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
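The generic fallback above folds the permutation into "permuted strides": assuming the library's convention that dst_shape[i] = src_shape[perm[i]], the destination coordinate along axis i equals the source coordinate along axis perm[i], so one destination offset can be accumulated directly from the source coordinate. A scalar sketch of that offset computation (hypothetical helper, not part of the patch):

    #include <array>
    #include <cstddef>

    // Destination byte offset for a given source coordinate under permutation `perm`,
    // assuming dst_shape[i] = src_shape[perm[i]].
    size_t permuted_dst_offset(const std::array<size_t, 4> &src_coord,
                               const std::array<size_t, 4> &dst_strides,
                               const std::array<size_t, 4> &perm)
    {
        size_t offset = 0;
        for(size_t i = 0; i < 4; ++i)
        {
            offset += src_coord[perm[i]] * dst_strides[i]; // dst axis i reads src axis perm[i]
        }
        return offset;
    }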
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_PERMUTE_KERNEL_H -#define ARM_COMPUTE_CPU_PERMUTE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel to perform tensor permutation given a permutation vector */ -class CpuPermuteKernel : public ICpuKernel -{ -public: - CpuPermuteKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPermuteKernel); - /** Configure kernel for a given list of arguments - * - * @note Arbitrary permutation vectors are supported with rank not greater than 4 - * - * @param[in] src Srouce tensor to permute. Data types supported: All - * @param[out] dst Destination tensor. Data types supported: Same as @p src - * @param[in] perm Permutation vector - */ - void configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuPermuteKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - PermutationVector _perm{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_PERMUTE_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuPool2dKernel.cpp b/src/core/cpu/kernels/CpuPool2dKernel.cpp deleted file mode 100644 index 27f4b950db..0000000000 --- a/src/core/cpu/kernels/CpuPool2dKernel.cpp +++ /dev/null @@ -1,516 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuPool2dKernel.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/common/Registrars.h" -#include "src/core/cpu/kernels/pool2d/neon/list.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/ToolchainSupport.h" - -#include "src/core/NEON/wrapper/wrapper.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -using namespace misc::shape_calculator; - -struct PoolingSelectorData -{ - DataType dt; - DataLayout dl; - int pool_stride_x; - Size2D pool_size; -}; - -using PoolingSelectorPtr = std::add_pointer::type; -using PoolingKernelPtr = std::add_pointer::type; -struct PoolingKernel -{ - const char *name; - const PoolingSelectorPtr is_selected; - PoolingKernelPtr ukernel; -}; - -static const PoolingKernel available_kernels[] = -{ - { - "neon_qu8_nhwc_poolMxN", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8)); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_qasymm8_neon_nhwc) - }, - { - "neon_qs8_nhwc_poolMxN", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8_SIGNED)); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_qasymm8_signed_neon_nhwc) - }, -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_f16_nhwc_poolMxN", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F16)); }, - REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nhwc) - }, -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ - { - "neon_fp32_nhwc_poolMxN", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F32)); }, - REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nhwc) - }, -#if defined(ENABLE_NCHW_KERNELS) - { - "neon_qu8_nchw_pool2", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw) - }, - { - "neon_qu8_nchw_pool3", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw) - }, - { - "neon_qu8_nchw_poolMxN", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8)); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw) - }, - { - "neon_qs8_nchw_pool2", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw) - }, - { - 
"neon_qs8_nchw_pool3", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw) - }, - { - "neon_qs8_nchw_poolMxN", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED)); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw) - }, -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_nchw_pool2", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); }, - REGISTER_FP16_NEON(arm_compute::cpu::pooling2_fp16_neon_nchw) - }, - { - "neon_fp16_nchw_pool3", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); }, - REGISTER_FP16_NEON(arm_compute::cpu::pooling3_fp16_neon_nchw) - }, - { - "neon_fp16_nchw_poolMxN", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16)); }, - REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nchw) - }, -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ - { - "neon_fp32_nchw_pool2", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); }, - REGISTER_FP32_NEON(arm_compute::cpu::pooling2_fp32_neon_nchw) - }, - { - "neon_fp32_nchw_pool3", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); }, - REGISTER_FP32_NEON(arm_compute::cpu::pooling3_fp32_neon_nchw) - }, - { - "neon_fp32_nchw_pool7", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 7)); }, - REGISTER_FP32_NEON(arm_compute::cpu::pooling7_fp32_neon_nchw) - }, - { - "neon_fp32_nchw_poolMxN", - [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32)); }, - REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nchw) - }, -#endif /* defined(ENABLE_NCHW_KERNELS) */ -}; - -/** Micro-kernel selector - * - * @param[in] data Selection data passed to help pick the appropriate micro-kernel - * - * @return A matching micro-kernel else nullptr - */ -const PoolingKernel *get_implementation(DataType dt, DataLayout dl, int pool_stride_x, Size2D pool_size) -{ - for(const auto &uk : available_kernels) - { - if(uk.is_selected({ dt, dl, pool_stride_x, pool_size })) - { - return &uk; - } - } - return nullptr; -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, - const ITensorInfo *indices, Size2D pool_size) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(pool_size.x() == 0); - ARM_COMPUTE_RETURN_ERROR_ON(pool_size.y() == 0); - - int pool_stride_x = 0; - int pool_stride_y = 0; - int output_width = 0; - int output_height = 0; - PoolingType pool_type = pool_info.pool_type; - const PadStrideInfo pad_stride_info = 
pool_info.pad_stride_info; - const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], - pool_size.x(), pool_size.y(), pool_info.pad_stride_info); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid"); - - TensorInfo out_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, dst->data_type())); - std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - if(indices) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32, DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); - } - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(src->data_type())); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(src->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding() - && (src->data_layout() == DataLayout::NHWC), - "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types"); - - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info); - if(indices) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &out_info); - } - } - - const auto *uk = get_implementation(src->data_type(), src->data_layout(), pool_stride_x, pool_size); - ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, ITensorInfo *indices, const PoolingLayerInfo &pool_info, - unsigned int &num_elems_processed_per_iteration, - BorderSize &border_size, - int pool_size_x, int pool_size_y) -{ - // dst auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, pool_info))); - if(indices) - { - // Indices auto inizialitation if not yet initialized - auto_init_if_empty(*indices, (src->clone()->set_tensor_shape(compute_pool_shape(*src, - pool_info))) - .set_data_type(DataType::U32) /* we store the offset to the element */); - } - const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? 
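The validation above requires the pooled extents returned by scaled_dimensions_signed() to be at least 1. As a reference for the common FLOOR-rounding case (CEIL rounding would round the division up instead), a small worked sketch of the pooled output extent:

    #include <cassert>

    // Pooled output extent with FLOOR rounding: floor((in + pads - pool) / stride) + 1.
    int pooled_extent_floor(int in, int pool, int stride, int pad_before, int pad_after)
    {
        return (in + pad_before + pad_after - pool) / stride + 1;
    }

    int main()
    {
        assert(pooled_extent_floor(224, 2, 2, 0, 0) == 112); // 2x2 stride-2 pooling halves the extent
        assert(pooled_extent_floor(7, 7, 1, 0, 0) == 1);     // global pooling collapses to 1
        return 0;
    }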
src->data_layout() : pool_info.data_layout; - unsigned int num_elems_read_per_iteration = 0; - unsigned int num_elems_horizontal_window = 0; - int pool_stride_x = 0; - int pool_stride_y = 0; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int src_width = src->dimension(idx_width); - const int src_height = src->dimension(idx_height); - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; - std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - const int pool_pad_right = pad_stride_info.pad_right(); - const int pool_pad_top = pad_stride_info.pad_top(); - const int pool_pad_left = pad_stride_info.pad_left(); - const int pool_pad_bottom = pad_stride_info.pad_bottom(); - const bool is_square = pool_size_x == pool_size_y; - const unsigned int pooled_w = dst->dimension(idx_width); - const unsigned int pooled_h = dst->dimension(idx_height); - - //If it's not squared and optimized will be executed the MxN - num_elems_read_per_iteration = 1; - num_elems_processed_per_iteration = 1; - num_elems_horizontal_window = 1; - - if(is_square) - { - switch(src->data_type()) - { - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - switch(pool_size_x) - { - case 2: - num_elems_read_per_iteration = 16; - num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15; - num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16; - break; - case 3: - num_elems_read_per_iteration = 16; - num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14; - num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16; - break; - default: - break; - } - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - switch(pool_size_x) - { - case 2: - case 3: - num_elems_read_per_iteration = 4; - num_elems_processed_per_iteration = 1; - num_elems_horizontal_window = 1; - break; - default: - break; - } - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - switch(pool_size_x) - { - case 2: - num_elems_read_per_iteration = 2; - break; - case 3: - num_elems_read_per_iteration = 4; // We use vload4 for pooling3 - break; - case 7: - num_elems_read_per_iteration = 8; // We use vload8 for pooling7 - break; - default: - break; - } - num_elems_processed_per_iteration = 1; - num_elems_horizontal_window = 1; - break; - default: - ARM_COMPUTE_ERROR("Element size not supported"); - break; - } - } - - bool window_changed = false; - Window win{}; - if(data_layout == DataLayout::NCHW) - { - // Number of iterations in X dimension - const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration; - // Upper limit for the number of right/bottom border elements that are accessed - const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - src_width; - const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - src_height; - border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left); - border_size.right = std::max(upper_bound_w, pool_pad_right); - border_size.bottom = std::max(upper_bound_h, pool_pad_bottom); - TensorShape dst_shape{ src->tensor_shape() }; - dst_shape.set(0, pooled_w); - dst_shape.set(1, pooled_h); - TensorInfo dst_info(src->clone()->set_tensor_shape(dst_shape)); - win = 
calculate_max_window(dst_info, Steps(num_elems_processed_per_iteration)); - AccessWindowStatic src_access(src, -pool_pad_left, -pool_pad_top, ceil_to_multiple(src_width + border_size.right, pool_size_x), src_height + border_size.bottom); - AccessWindowHorizontal dst_access(dst, 0, num_elems_horizontal_window); - if(indices) - { - AccessWindowHorizontal indices_access(indices, 0, num_elems_horizontal_window); - window_changed = update_window_and_padding(win, src_access, dst_access, indices_access); - } - else - { - window_changed = update_window_and_padding(win, src_access, dst_access); - } - dst_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); - - border_size = src->padding(); - } - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} -} // namespace - -BorderSize CpuPool2dKernel::border_size() const -{ - return _border_size; -} - -void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; - const bool is_global_pooling = pool_info.is_global_pooling; - - // Get data layout - const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - // Update pool size in case of global pooling - const Size2D pool_size( - is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width, - is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices, pool_size)); - - const auto *uk = get_implementation(src->data_type(), src->data_layout(), pad_stride_info.stride().first, pool_size); - ARM_COMPUTE_ERROR_ON(uk == nullptr); - - // Set instance variables - _pool_info = pool_info; - _data_layout = src->data_layout(); - _pool_size = pool_size; - _pool_stride_x = pad_stride_info.stride().first; - _run_method = uk->ukernel; - _name = std::string("CpuPool2dKernel").append("/").append(uk->name); - - if(_data_layout == DataLayout::NHWC) - { - // Configure kernel window - Window win = calculate_max_window(*dst, Steps()); - ICpuKernel::configure(win); - } - else - { - // Configure kernel window - auto win_config = validate_and_configure_window(src, dst, indices, pool_info, _num_elems_processed_per_iteration, - _border_size, pool_size.x(), pool_size.y()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICpuKernel::configure(win_config.second); - } -} - -Status CpuPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); - - unsigned int num_elems_processed_per_iteration = 0; - BorderSize border_size(0); - - const bool is_global_pooling = pool_info.is_global_pooling; - - // Get data layout - const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? 
src->data_layout() : pool_info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; - unsigned int pool_size_y = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; - - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices, Size2D(pool_size_x, pool_size_y))); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), - (indices) ? indices->clone().get() : nullptr, pool_info, num_elems_processed_per_iteration, border_size, - pool_size_x, pool_size_y) - .first); - - return Status{}; -} - -void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_run_method == nullptr); - - const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - ITensor *dst = tensors.get_tensor(TensorType::ACL_DST_0); - ITensor *indices = tensors.get_tensor(TensorType::ACL_DST_1); - - const unsigned int pool_stride_x = _pool_info.pad_stride_info.stride().first; - const unsigned int pool_stride_y = _pool_info.pad_stride_info.stride().second; - const unsigned int pool_size = _pool_info.pool_size.width; - - Window window_src(window); - if(_data_layout == DataLayout::NCHW) - { - // Set step for src in x and y direction for the src - unsigned int window_x_inc = 0; - switch(src->info()->data_type()) - { - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - { - window_x_inc = pool_stride_x; - if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3) - { - window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration; - } - break; - } - - case DataType::F16: - case DataType::F32: - { - window_x_inc = pool_stride_x; - break; - } - default: - { - ARM_COMPUTE_ERROR("Not supported"); - } - } - window_src.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc)); - window_src.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y)); - } - else - { - window_src.set(Window::DimX, Window::Dimension(0, 1, 1)); - window_src.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), pool_stride_x)); - window_src.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y)); - } - _run_method(src, dst, indices, _pool_info, window_src, window); -} - -const char *CpuPool2dKernel::name() const -{ - return _name.c_str(); -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuPool2dKernel.h b/src/core/cpu/kernels/CpuPool2dKernel.h deleted file mode 100644 index 9ed398b907..0000000000 --- a/src/core/cpu/kernels/CpuPool2dKernel.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_POOL2D_KERNEL_H -#define ARM_COMPUTE_CPU_POOL2D_KERNEL_H - -#include "arm_compute/core/Types.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the pooling layer kernel */ -class CpuPool2dKernel : public ICpuKernel -{ -public: - CpuPool2dKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2dKernel); - /** Configure kernel for a given list of arguments - * - * @note F16 are supported for pool sizes 2 and 3 only - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] dst Destination tensor info. Data types supported: Same as @p src. - * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. - * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. - */ - void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuPool2dKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - BorderSize border_size() const override; - const char *name() const override; - -private: - using PoolingKernelPtr = std::add_pointer::type; - -private: - PoolingLayerInfo _pool_info{}; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - unsigned int _num_elems_processed_per_iteration{ 0 }; - BorderSize _border_size{ 0 }; - Size2D _pool_size{}; - int _pool_stride_x{}; - PoolingKernelPtr _run_method{ nullptr }; - std::string _name{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_POOL2D_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuQuantizeKernel.cpp b/src/core/cpu/kernels/CpuQuantizeKernel.cpp deleted file mode 100644 index 8ca81e8b11..0000000000 --- a/src/core/cpu/kernels/CpuQuantizeKernel.cpp +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
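Because CpuPool2dKernel::validate() above works on ITensorInfo only, a configuration can be checked without allocating any buffers. A sketch under that assumption; the shapes, layout, and PoolingLayerInfo arguments are hypothetical and use the pre-move include path removed by this patch:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/core/cpu/kernels/CpuPool2dKernel.h"

    using namespace arm_compute;

    bool can_run_2x2_max_pool()
    {
        TensorInfo src(TensorShape(16U, 16U, 8U), 1, DataType::F32);
        src.set_data_layout(DataLayout::NHWC);
        TensorInfo dst; // left empty: the pooled shape is derived during validation/configuration

        const PoolingLayerInfo info(PoolingType::MAX, Size2D(2, 2), DataLayout::NHWC,
                                    PadStrideInfo(2, 2, 0, 0));
        return bool(cpu::kernels::CpuPool2dKernel::validate(&src, &dst, info));
    }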
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuQuantizeKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "src/core/CPP/Validate.h" - -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -constexpr auto window_step = 16; - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - - return Status{}; -} - -template -inline float32x4x4_t load_value(const T *input_ptr) -{ - using Tx16_t = typename wrapper::traits::neon_vector::type; - return arm_compute::convert_to_float32x4x4(wrapper::vloadq(input_ptr)); -} - -template <> -inline float32x4x4_t load_value(const float *input_ptr) -{ - return { wrapper::vloadq(input_ptr), - wrapper::vloadq(input_ptr + 4), - wrapper::vloadq(input_ptr + 8), - wrapper::vloadq(input_ptr + 12) }; -} -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template <> -inline float32x4x4_t load_value(const float16_t *input_ptr) -{ - return { vcvt_f32_f16(wrapper::vload(input_ptr)), - vcvt_f32_f16(wrapper::vload(input_ptr + 4)), - vcvt_f32_f16(wrapper::vload(input_ptr + 8)), - vcvt_f32_f16(wrapper::vload(input_ptr + 12)) }; -} - -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -template -using vector_type = wrapper::traits::neon_vector_t; - -template -vector_type vquantize_qasymm8(const float32x4x4_t &qv, const UniformQuantizationInfo &qi); - -template <> -vector_type vquantize_qasymm8(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) -{ - return vquantize(qv, qi); -} - -template <> -vector_type vquantize_qasymm8(const float32x4x4_t &qv, 
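The vectorised paths above quantise sixteen floats per iteration via float32x4x4_t. A scalar reference of the QASYMM8 mapping they implement, q = clamp(round(x / scale) + offset, 0, 255) for a uniform (scale, offset) pair; note the actual rounding mode depends on the target, as handled in run_quantize_qasymm8() below (sketch only, not from the patched sources):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Scalar QASYMM8 quantisation; std::lround rounds half away from zero, whereas the
    // kernel uses round-to-nearest-even on AArch64 and round-to-zero elsewhere.
    uint8_t quantize_qasymm8_ref(float x, float scale, int32_t offset)
    {
        const int32_t q = static_cast<int32_t>(std::lround(x / scale)) + offset;
        return static_cast<uint8_t>(std::min(255, std::max(0, q)));
    }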
const UniformQuantizationInfo &qi) -{ - return vquantize_signed(qv, qi); -} - -} // namespace - -void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - - static const std::map quant_map = - { - { "op_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_QASYMM8_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16 }, - - { "op_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16 }, - - { "op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_F32_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_F32_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16 }, - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { "op_F16_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_F16_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8 }, - { "op_F16_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16 }, -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/ - }; - - std::string function_to_call("op_"); - function_to_call += string_from_data_type(src->data_type()) + "_"; - function_to_call += string_from_data_type(dst->data_type()); - - auto it = quant_map.find(function_to_call); - - if(it == quant_map.end()) - { - ARM_COMPUTE_ERROR("Unsupported combination of input and output data types"); - } - _func = it->second; - - // Configure kernel window - Window win_config = calculate_max_window(*src, Steps()); - ICpuKernel::configure(win_config); -} - -Status CpuQuantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); - return Status{}; -} - -template -void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) -{ - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); - UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); - if(is_data_type_quantized_asymmetric(src->info()->data_type())) - { - uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); - } -#ifdef __aarch64__ - constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; -#else //__aarch64__ - constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; -#endif //__aarch64__ - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - auto input_ptr = reinterpret_cast(input.ptr()); - auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step); x += window_step) - { - wrapper::vstore(&output_ptr[x], vquantize_qasymm8(load_value(&input_ptr[x]), uqinfo)); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - output_ptr[x] = Qasymm8QuantizationHelper::quantize(input_ptr[x], uqinfo, rounding_policy); - } - }, - 
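configure() above dispatches by building a string key of the form "op_<SRC_TYPE>_<DST_TYPE>" and looking it up in a map of member-function pointers. A generic sketch of that lookup shape with hypothetical names (not the kernel's actual map):

    #include <functional>
    #include <map>
    #include <stdexcept>
    #include <string>

    using QuantizeFn = std::function<void()>;

    // Resolve the handler for a (src, dst) data-type pair, failing loudly on unsupported combinations.
    QuantizeFn resolve(const std::string &src_type, const std::string &dst_type,
                       const std::map<std::string, QuantizeFn> &table)
    {
        const std::string key = "op_" + src_type + "_" + dst_type;
        const auto it = table.find(key);
        if(it == table.end())
        {
            throw std::runtime_error("Unsupported combination of input and output data types");
        }
        return it->second;
    }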
input, output); -} - -template -void CpuQuantizeKernel::run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window) -{ - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); - UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); - if(is_data_type_quantized_asymmetric(src->info()->data_type())) - { - uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); - } -#ifdef __aarch64__ - constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; -#else //__aarch64__ - constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; -#endif //__aarch64__ - - // Collapse window and reset first dimension to handle tail calculations manually - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - auto input_ptr = reinterpret_cast(input.ptr()); - auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step); x += window_step) - { - uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo); - vst1q_u16(&output_ptr[x], tmp.val[0]); - vst1q_u16(&output_ptr[x + 8], tmp.val[1]); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy); - } - }, - input, output); -} - -void CpuQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - (this->*_func)(src, dst, window); -} - -const char *CpuQuantizeKernel::name() const -{ - return "CpuQuantizeKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/CpuQuantizeKernel.h b/src/core/cpu/kernels/CpuQuantizeKernel.h deleted file mode 100644 index 834a2e03d2..0000000000 --- a/src/core/cpu/kernels/CpuQuantizeKernel.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H -#define ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the quantization layer kernel. - * - * @note The implementation supports only 3D input tensors - */ -class CpuQuantizeKernel : public ICpuKernel -{ -public: - CpuQuantizeKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuQuantizeKernel); - /** Set the input, output. - * - * @param[in] src Source tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16. - * @param[out] dst Destination tensor info with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16. - * - * @note Output auto initialization is not supported by this kernel - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuQuantizeKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - /** Common signature for all the specialised @ref CpuQuantizeKernel functions - * - * @param[in] window Region on which to execute the kernel. - */ - using QuantizeFunctionExecutorPtr = void (CpuQuantizeKernel::*)(const ITensor *src, ITensor *dst, const Window &window); - /** Function to apply QASYMM8 or QASYMM8_SIGNED quantization on a tensor. - * - * @param[in] window Region on which to execute the kernel. - */ - template - void run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window); - /** Function to apply QASYMM16 quantization on a tensor. - * - * @param[in] window Region on which to execute the kernel. - */ - template - void run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window); - - QuantizeFunctionExecutorPtr _func{ nullptr }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuReshapeKernel.cpp b/src/core/cpu/kernels/CpuReshapeKernel.cpp deleted file mode 100644 index 5b717b9bba..0000000000 --- a/src/core/cpu/kernels/CpuReshapeKernel.cpp +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuReshapeKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/INEKernel.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -/** [NEReshapeLayerKernel Kernel] **/ -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - - if(dst->tensor_shape().total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() != dst->tensor_shape().total_size()); - } - - return Status{}; -} - -template -inline void reshape_tensor(const Window &window, const ITensor *src, ITensor *dst) -{ - const TensorShape &src_shape = src->info()->tensor_shape(); - const TensorShape &dst_shape = dst->info()->tensor_shape(); - Coordinates dst_coord{}; - - Iterator src_it(src, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - dst_coord = index2coords(dst_shape, coords2index(src_shape, id)); - *reinterpret_cast(dst->ptr_to_element(dst_coord)) = *reinterpret_cast(src_it.ptr()); - }, - src_it); -} -} // namespace - -void CpuReshapeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - ARM_COMPUTE_UNUSED(dst); - - // Configure kernel window - Window win = calculate_max_window(*src); - - ICpuKernel::configure(win); -} - -Status CpuReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); - - return Status{}; -} - -void CpuReshapeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - switch(src->info()->data_type()) - { - case DataType::U8: - case DataType::S8: - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - reshape_tensor(window, src, dst); - break; - case DataType::U16: - case DataType::S16: - case DataType::F16: - reshape_tensor(window, src, dst); - break; - case DataType::U32: - case DataType::S32: - case DataType::F32: - reshape_tensor(window, src, dst); - break; - 
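The reshape_tensor() helper above never rearranges data: it converts each source coordinate to a linear index and back into a destination coordinate, so element i of the source lands at element i of the destination. A small stand-alone sketch of that mapping follows; coords_to_index()/index_to_coords() are simplified stand-ins for the library's coords2index()/index2coords().

#include <cstdio>
#include <vector>

using Shape  = std::vector<int>;
using Coords = std::vector<int>;

// Row-major (dimension 0 fastest) linearisation, matching how the window iterates.
static int coords_to_index(const Shape &shape, const Coords &c)
{
    int index = 0, stride = 1;
    for(size_t d = 0; d < shape.size(); ++d)
    {
        index += c[d] * stride;
        stride *= shape[d];
    }
    return index;
}

static Coords index_to_coords(const Shape &shape, int index)
{
    Coords c(shape.size());
    for(size_t d = 0; d < shape.size(); ++d)
    {
        c[d] = index % shape[d];
        index /= shape[d];
    }
    return c;
}

int main()
{
    const Shape  src_shape{ 4, 3 }; // 12 elements
    const Shape  dst_shape{ 6, 2 }; // same total size, different shape
    const Coords src_coord{ 1, 2 }; // element at x=1, y=2
    const Coords dst_coord = index_to_coords(dst_shape, coords_to_index(src_shape, src_coord));
    std::printf("(%d, %d) -> (%d, %d)\n", src_coord[0], src_coord[1], dst_coord[0], dst_coord[1]); // (1, 2) -> (3, 1)
}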
default: - ARM_COMPUTE_ERROR("Unsupported data type!"); - } -} - -const char *CpuReshapeKernel::name() const -{ - return "CpuReshapeKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -/** [NEReshapeLayerKernel Kernel] **/ diff --git a/src/core/cpu/kernels/CpuReshapeKernel.h b/src/core/cpu/kernels/CpuReshapeKernel.h deleted file mode 100644 index 1425fbe917..0000000000 --- a/src/core/cpu/kernels/CpuReshapeKernel.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_RESHAPE_KERNEL_H -#define ARM_COMPUTE_CPU_RESHAPE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the kernel to perform tensor reshaping */ -class CpuReshapeKernel : public ICpuKernel -{ -public: - CpuReshapeKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuReshapeKernel); - /** Configure kernel for a given list of arguments - * - * @param[in] src Source tensor info. Data type supported: All - * @param[out] dst Destination tensor info. Data type supported: Same as @p input - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuReshapeKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_RESHAPE_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuScaleKernel.cpp b/src/core/cpu/kernels/CpuScaleKernel.cpp deleted file mode 100644 index 0c1f08ab79..0000000000 --- a/src/core/cpu/kernels/CpuScaleKernel.cpp +++ /dev/null @@ -1,623 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. 
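Note that the switch in run_op() above dispatches on element size rather than exact type: a reshape only relocates bytes, so one instantiation per 1-, 2- and 4-byte width covers every listed data type. A compact sketch of that idea, with hypothetical helper names:

#include <cstddef>
#include <cstdint>
#include <cstdio>

template <typename T>
static void copy_as(const void *src, void *dst, std::size_t n)
{
    const T *s = static_cast<const T *>(src);
    T       *d = static_cast<T *>(dst);
    for(std::size_t i = 0; i < n; ++i) { d[i] = s[i]; }
}

// One path per element size; the exact data type is irrelevant for a byte-preserving reshape.
static void reshape_bytes(const void *src, void *dst, std::size_t n, std::size_t element_size)
{
    switch(element_size)
    {
        case 1: copy_as<uint8_t>(src, dst, n);  break; // U8, S8, QASYMM8, QASYMM8_SIGNED
        case 2: copy_as<uint16_t>(src, dst, n); break; // U16, S16, F16
        case 4: copy_as<uint32_t>(src, dst, n); break; // U32, S32, F32
        default: std::puts("Unsupported data type!");  break;
    }
}

int main()
{
    const float in[4] = { 1.f, 2.f, 3.f, 4.f };
    float       out[4] = {};
    reshape_bytes(in, out, 4, sizeof(float));
    std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
}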
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuScaleKernel.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/Utility.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/common/Registrars.h" -#include "src/core/cpu/kernels/scale/neon/list.h" -#include "src/core/cpu/kernels/scale/sve/list.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/core/utils/ScaleUtils.h" -#include "support/Rounding.h" - -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -struct ScaleSelectorData -{ - DataType dt; - const CPUInfo &ci; -}; -using ScaleSelectorPtr = std::add_pointer::type; -using ScaleKernelPtr = std::add_pointer::type; -struct ScaleKernel -{ - const char *name; - const ScaleSelectorPtr is_selected; - ScaleKernelPtr ukernel; -}; - -static const ScaleKernel available_kernels[] = -{ -#if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_fp16_scale", - [](const ScaleSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, - REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale) - }, - { - "sve_fp32_scale", - [](const ScaleSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, - REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale) - }, - { - "sve_qu8_scale", - [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve(); }, - REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale) - }, - { - "sve_qs8_scale", - [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve(); }, - REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale) - }, - { - "sve_u8_scale", - [](const ScaleSelectorData & data) { return data.dt == DataType::U8 && data.ci.has_sve(); }, - REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale) - }, - { - "sve_s16_scale", - [](const ScaleSelectorData & data) { return data.dt == DataType::S16 && data.ci.has_sve(); }, - REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ -#if defined(ARM_COMPUTE_ENABLE_NEON) -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_scale", - [](const ScaleSelectorData & data) { return data.dt == 
DataType::F16 && data.ci.has_fp16(); }, - REGISTER_FP16_NEON(arm_compute::cpu::common_neon_scale) - }, -#endif /* !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ - { - "neon_fp32_scale", - [](const ScaleSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::common_neon_scale) - }, - { - "neon_qu8_scale", - [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_scale) - }, - { - "neon_qs8_scale", - [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_scale) - }, - { - "neon_u8_scale", - [](const ScaleSelectorData & data) { return data.dt == DataType::U8; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::common_neon_scale) - }, - { - "neon_s16_scale", - [](const ScaleSelectorData & data) { return data.dt == DataType::S16; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::common_neon_scale) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ -}; - -/** Micro-kernel selector - * - * @param[in] data Selection data passed to help pick the appropriate micro-kernel - * - * @return A matching micro-kernel else nullptr - */ -const ScaleKernel *get_implementation(const ScaleSelectorData &data) -{ - for(const auto &uk : available_kernels) - { - if(uk.is_selected(data)) - { - return &uk; - } - } - return nullptr; -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, - const ITensorInfo *offsets, ITensorInfo *dst, const ScaleKernelInfo &info) -{ - const auto *uk = get_implementation(ScaleSelectorData{ src->data_type(), CPUInfo::get() }); - ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(dst == src); - ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT); - ARM_COMPUTE_UNUSED(info.constant_border_value); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.use_padding, "Padding is not supported"); - - const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? 
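The available_kernels[] table and get_implementation() above form a simple first-match micro-kernel selector: SVE entries precede the NEON fallbacks, so the most specialised implementation the CPU supports wins. A minimal stand-alone sketch of that pattern follows; DataType, CpuFlags and the two dummy ukernels are simplified stand-ins.

#include <cstdio>

enum class DataType { F32, F16, QASYMM8 };
struct CpuFlags     { bool has_fp16; bool has_sve; };
struct SelectorData { DataType dt; CpuFlags ci; };

struct Kernel
{
    const char *name;
    bool (*is_selected)(const SelectorData &);
    void (*ukernel)();
};

static void sve_fp32_scale()  { std::puts("sve_fp32_scale"); }
static void neon_fp32_scale() { std::puts("neon_fp32_scale"); }

// Order matters: more specialised entries first, generic fallbacks last.
static const Kernel available_kernels[] =
{
    { "sve_fp32_scale",  [](const SelectorData &d) { return d.dt == DataType::F32 && d.ci.has_sve; }, sve_fp32_scale  },
    { "neon_fp32_scale", [](const SelectorData &d) { return d.dt == DataType::F32; },                 neon_fp32_scale },
};

static const Kernel *get_implementation(const SelectorData &data)
{
    for(const auto &uk : available_kernels)
    {
        if(uk.is_selected(data)) { return &uk; }
    }
    return nullptr;
}

int main()
{
    const Kernel *uk = get_implementation({ DataType::F32, { /*has_fp16=*/true, /*has_sve=*/false } });
    if(uk != nullptr) { uk->ukernel(); } // without SVE this falls back to neon_fp32_scale
}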
src->data_layout() : info.data_layout; - const auto width_index = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const auto height_index = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const auto output_width = dst->dimension(width_index); - const auto output_height = dst->dimension(height_index); - ARM_COMPUTE_RETURN_ERROR_ON(output_width == 0); - ARM_COMPUTE_RETURN_ERROR_ON(output_height == 0); - - if(info.interpolation_policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32); - } - - if(info.interpolation_policy == InterpolationPolicy::BILINEAR) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32); - if(dx != nullptr && dy != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dx, 1, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dy, 1, DataType::F32); - } - } - - ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy)); - - if(info.interpolation_policy == InterpolationPolicy::AREA) - { - ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8); - } - - return Status{}; -} -} // namespace - -void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, - ITensorInfo *dst, const ScaleKernelInfo &info) -{ - ARM_COMPUTE_UNUSED(dx, dy, offsets); - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, - dx, - dy, - offsets, - dst, - info)); - - const auto *uk = get_implementation(ScaleSelectorData{ src->data_type(), CPUInfo::get() }); - ARM_COMPUTE_ERROR_ON_NULLPTR(uk); - - _run_method = uk->ukernel; - _name = std::string("CpuScaleKernel").append("/").append(uk->name).append("_").append(string_from_interpolation_policy(info.interpolation_policy)); - - // Get data layout and width/height indices - _data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - - _policy = info.interpolation_policy; - _border_mode = info.border_mode; - _constant_border_value = info.constant_border_value; - _align_corners = info.align_corners; - - if(info.sampling_policy == SamplingPolicy::CENTER) - { - _sampling_offset = 0.5f; - } - - // Compute the ratio between source width/height and destination width/height - const auto wr = scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), _align_corners); - const auto hr = scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), _align_corners); - - // Area interpolation behaves as Nearest Neighbour in case of up-sampling - _policy = (_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? 
InterpolationPolicy::NEAREST_NEIGHBOR : _policy; - - if(_border_mode == BorderMode::UNDEFINED) - { - _border_mode = BorderMode::CONSTANT; - _constant_border_value = PixelValue(); - } - -#ifdef ENABLE_NCHW_KERNELS - // Configure scale function to run - if(_data_layout == DataLayout::NCHW) - { - std::string function_to_call("scale_"); - function_to_call += string_from_data_type(src->data_type()) + "_"; - function_to_call += string_from_data_layout(_data_layout) + "_"; - function_to_call += string_from_interpolation_policy(_policy); - - static std::map map_function = - { - { "scale_U8_NCHW_AREA_CONSTANT", &CpuScaleKernel::scale_area_nchw_u8 }, - - { "scale_U8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw }, - { "scale_U8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw }, - - { "scale_QASYMM8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm }, - { "scale_QASYMM8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw }, - - { "scale_QASYMM8_SIGNED_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm }, - { "scale_QASYMM8_SIGNED_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw }, - - { "scale_S16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw }, - { "scale_S16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw }, - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { "scale_F16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw }, - { "scale_F16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw }, -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - - { "scale_F32_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw }, - { "scale_F32_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw }, - }; - auto it = map_function.find(function_to_call); - if(it != map_function.end()) - { - _func = it->second; - } - } -#endif // ENABLE_NCHW_KERNELS - - // Configure window - Window win = calculate_max_window(*dst, Steps()); - ICpuKernel::configure(win); -} - -#ifdef ENABLE_NCHW_KERNELS -template -void CpuScaleKernel::scale_nearest_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window) -{ - ARM_COMPUTE_UNUSED(dx, dy); - const size_t in_stride_x = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners); - - // Don't increment in X and Y direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - - // Set offsets window - Window win_off; - win_off.set(Window::DimX, window[Window::DimX]); - win_off.set(Window::DimY, window[Window::DimY]); - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) - { - win_off.set(d, Window::Dimension(0, 0, 0)); - } - - // Create iterators - Iterator src_i(src, win_in); - Iterator dst_i(dst, window); - Iterator offsets_i(offsets, win_off); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offsets_ptr = reinterpret_cast(offsets_i.ptr()); - const auto in_yi = static_cast(_align_corners ? 
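The AREA-to-nearest-neighbour fallback above hinges on the resize ratios wr and hr, which express how many source steps correspond to one destination step; ratios of at most 1 in both dimensions mean up-sampling, where area averaging degenerates to picking a single pixel. The sketch below shows the ratio computation; the (in - 1)/(out - 1) form for align_corners follows the usual convention and is an assumption, not taken from the library source.

#include <cstdio>

static float calculate_resize_ratio(unsigned int in, unsigned int out, bool align_corners)
{
    return align_corners ? static_cast<float>(in - 1) / static_cast<float>(out - 1)
                         : static_cast<float>(in) / static_cast<float>(out);
}

int main()
{
    const float wr = calculate_resize_ratio(32, 64, false); // 0.50 source steps per destination step
    const float hr = calculate_resize_ratio(32, 48, false); // ~0.67
    const bool  area_as_nearest = (wr <= 1.f && hr <= 1.f); // up-sampling in both axes
    std::printf("wr=%.2f hr=%.2f area->nearest=%d\n", wr, hr, area_as_nearest);
}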
utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor(( - id.y() + _sampling_offset) - * hr)); - const int32_t offset_row = in_yi * in_stride_x; - *reinterpret_cast(dst_i.ptr()) = *(reinterpret_cast(src_i.ptr()) + offsets_ptr[0] + offset_row); - }, - src_i, offsets_i, dst_i); -} - -template -void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window) -{ - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners); - Window win_off; - win_off.set(Window::DimX, window.x()); - win_off.set(Window::DimY, window.y()); - - // Don't increment in X and Y direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) - { - win_off.set(d, Window::Dimension(0, 0, 0)); - } - - Iterator src_i(src, win_in); - Iterator dst_i(dst, window); - Iterator offsets_i(offsets, win_off); - Iterator dx_i(dx, win_off); - Iterator dy_i(dy, win_off); - - const int32_t in_dim_w = src->info()->dimension(0); - const int32_t in_dim_h = src->info()->dimension(1); - const int32_t in_stride_w = in_dim_w + src->info()->padding().left + src->info()->padding().right; - - if(_border_mode == BorderMode::CONSTANT) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - using ConstType = typename std::conditional::value, half, T>::type; -#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - using ConstType = T; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - const T const_border_value = static_cast(_constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset); - const auto index_w = *(reinterpret_cast(offsets_i.ptr())); - const auto dx_val = *(reinterpret_cast(dx_i.ptr())); - const auto dy_val = *(reinterpret_cast(dy_i.ptr())); - const auto pixel_row_ptr = reinterpret_cast(src_i.ptr()); - - const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + index_h * in_stride_w)) : const_border_value; - const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w)) : const_border_value; - const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h - && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + index_w + index_h * in_stride_w + in_stride_w)) : - const_border_value; - const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h - && index_h < in_dim_h - 1) ? 
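For the nearest-neighbour NCHW path above, the horizontal source position comes from the precomputed offsets tensor while the vertical one is derived on the fly from the destination row as (y + sampling_offset) * hr, rounded half away from zero when align_corners is set and floored otherwise. A scalar sketch of that row computation, with a hypothetical round_half_away_from_zero helper:

#include <cmath>
#include <cstdio>

static int round_half_away_from_zero(float v)
{
    return static_cast<int>(std::round(v)); // std::round already rounds halves away from zero
}

static int source_row(int dst_y, float hr, float sampling_offset, bool align_corners)
{
    const float pos = (dst_y + sampling_offset) * hr;
    return align_corners ? round_half_away_from_zero(pos) : static_cast<int>(std::floor(pos));
}

int main()
{
    // Destination row 2, ratio 2.2, CENTER sampling (offset 0.5): position 5.5
    // floors to source row 5 but rounds away from zero to row 6.
    std::printf("%d\n", source_row(2, 2.2f, 0.5f, /*align_corners=*/false)); // 5
    std::printf("%d\n", source_row(2, 2.2f, 0.5f, /*align_corners=*/true));  // 6
}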
- (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w + in_stride_w)) : - const_border_value; - - *reinterpret_cast(dst_i.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - src_i, offsets_i, dx_i, dy_i, dst_i); - } - else if(_border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const int index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset); - const auto index_w = *(reinterpret_cast(offsets_i.ptr())); - const auto dx_val = *(reinterpret_cast(dx_i.ptr())); - const auto dy_val = *(reinterpret_cast(dy_i.ptr())); - const auto pixel_row_ptr = reinterpret_cast(src_i.ptr()); - - auto clamped_x = utility::clamp(index_w, 0, in_dim_w - 1); - auto clamped_x1 = utility::clamp(index_w + 1, 0, in_dim_w - 1); - auto clamped_y = utility::clamp(index_h, 0, in_dim_h - 1); - auto clamped_y1 = utility::clamp(index_h + 1, 0, in_dim_h - 1); - - const auto a00 = *(pixel_row_ptr + clamped_x + clamped_y * in_stride_w); - const auto a01 = *(pixel_row_ptr + clamped_x1 + clamped_y * in_stride_w); - const auto a10 = *(pixel_row_ptr + clamped_x + clamped_y1 * in_stride_w); - const auto a11 = *(pixel_row_ptr + clamped_x1 + clamped_y1 * in_stride_w); - - *reinterpret_cast(dst_i.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - src_i, offsets_i, dx_i, dy_i, dst_i); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} - -void CpuScaleKernel::scale_area_nchw_u8(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window) -{ - ARM_COMPUTE_UNUSED(dx, dy, offsets); - using namespace scale_helpers; - - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8); - - // Don't increment in width/height/channels for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Iterator src_i(src, win_in); - Iterator dst_i(dst, window); - - const auto wr = scale_utils::calculate_resize_ratio(src->info()->dimension(0), dst->info()->dimension(0), _align_corners); - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners); - const auto w = src->info()->dimension(0); - const auto h = src->info()->dimension(1); - const size_t in_stride = src->info()->strides_in_bytes()[1]; - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto in_ptr = reinterpret_cast(src_i.ptr()); - - uint8x8_t tmp0 = vdup_n_u8(0); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x(), id.y()), tmp0, 0); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 1, id.y()), tmp0, 1); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 2, id.y()), tmp0, 2); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 3, id.y()), tmp0, 3); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 4, id.y()), tmp0, 4); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 5, id.y()), tmp0, 5); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 6, id.y()), tmp0, 6); - tmp0 = 
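Both bilinear branches above end in scale_helpers::delta_bilinear(a00, a01, a10, a11, dx, dy); the weighting it applies is the textbook bilinear formula (assumed here to match the library helper), and the REPLICATE branch simply clamps the four neighbour coordinates before reading. A scalar sketch:

#include <algorithm>
#include <cstdio>

static float delta_bilinear(float a00, float a01, float a10, float a11, float dx, float dy)
{
    // Weighted average of the 2x2 neighbourhood; dx/dy are the fractional distances to the top-left sample.
    const float w00 = (1.f - dx) * (1.f - dy);
    const float w01 = dx * (1.f - dy);
    const float w10 = (1.f - dx) * dy;
    const float w11 = dx * dy;
    return a00 * w00 + a01 * w01 + a10 * w10 + a11 * w11;
}

int main()
{
    // REPLICATE border handling: clamp the neighbour coordinates into [0, dim - 1] before reading.
    const int in_dim_w = 4;
    const int index_w  = 3;                                   // right-most column
    const int x0 = std::clamp(index_w, 0, in_dim_w - 1);      // 3
    const int x1 = std::clamp(index_w + 1, 0, in_dim_w - 1);  // also 3: the edge pixel is replicated
    std::printf("x0=%d x1=%d\n", x0, x1);
    std::printf("%.2f\n", delta_bilinear(10.f, 20.f, 30.f, 40.f, 0.25f, 0.5f)); // 22.50
}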
vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 7, id.y()), tmp0, 7); - - uint8x8_t tmp1 = vdup_n_u8(0); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 8, id.y()), tmp1, 0); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 9, id.y()), tmp1, 1); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 10, id.y()), tmp1, 2); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 11, id.y()), tmp1, 3); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 12, id.y()), tmp1, 4); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 13, id.y()), tmp1, 5); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 14, id.y()), tmp1, 6); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 15, id.y()), tmp1, 7); - - vst1q_u8(dst_i.ptr(), vcombine_u8(tmp0, tmp1)); - }, - src_i, dst_i); -} - -template -void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window) -{ - // Get data layout and width/height indices - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), _align_corners); - Window win_off; - win_off.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_off.set(Window::DimY, Window::Dimension(0, 0, 0)); - - // Don't increment in X and Y direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(idx_width, Window::Dimension(0, 0, 0)); - win_in.set(idx_height, Window::Dimension(0, 0, 0)); - - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) - { - win_off.set(d, Window::Dimension(0, 0, 0)); - } - - Iterator src_i(src, win_in); - Iterator dst_i(dst, window); - - const int32_t in_dim_w = src->info()->dimension(idx_width); - const int32_t in_dim_h = src->info()->dimension(idx_height); - const int32_t stride_w = src->info()->strides_in_bytes()[idx_width]; - const int32_t stride_h = src->info()->strides_in_bytes()[idx_height]; - - const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - if(_border_mode == BorderMode::CONSTANT) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - using ConstType = typename std::conditional::value, half, T>::type; -#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - using ConstType = T; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - const T const_border_value = static_cast(_constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dy_val = 
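The U8 area path above fills two uint8x8_t registers one destination pixel at a time; each pixel_area_c1u8_clamp() call averages the block of source pixels covered by that destination pixel, clamped at the borders. The scalar sketch below illustrates the averaging under the simplifying assumption of an integer down-scale factor and no border handling.

#include <cstdint>
#include <cstdio>

// Average the wr x hr source block that maps onto destination pixel (dst_x, dst_y).
static uint8_t area_average(const uint8_t *src, int src_w, int dst_x, int dst_y, int wr, int hr)
{
    int sum = 0;
    for(int y = 0; y < hr; ++y)
    {
        for(int x = 0; x < wr; ++x)
        {
            sum += src[(dst_y * hr + y) * src_w + (dst_x * wr + x)];
        }
    }
    return static_cast<uint8_t>(sum / (wr * hr));
}

int main()
{
    const uint8_t src[4 * 4] =
    {
        10,  20,  30,  40,
        50,  60,  70,  80,
        90,  100, 110, 120,
        130, 140, 150, 160,
    };
    // 4x4 -> 2x2 down-scale: destination (0, 0) averages the top-left 2x2 block.
    std::printf("%d\n", area_average(src, 4, 0, 0, 2, 2)); // (10 + 20 + 50 + 60) / 4 = 35
}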
*(reinterpret_cast(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto pixel_row_ptr = reinterpret_cast(src_i.ptr()); - - const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) : - const_border_value; - const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) : - const_border_value; - const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(dst_i.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - src_i, dst_i); - } - else if(_border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const int index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto pixel_row_ptr = reinterpret_cast(src_i.ptr()); - - auto clamped_w = utility::clamp(index_w, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(index_w + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(index_h, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(index_h + 1, 0, in_dim_h - 1); - - const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h); - const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h); - const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h); - const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h); - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(dst_i.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - src_i, dst_i); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} -#endif // ENABLE_NCHW_KERNELS - -Status CpuScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy, - const ITensorInfo *offsets, ITensorInfo *output, const ScaleKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, dx, dy, offsets, output, info)); - return Status{}; -} - -void CpuScaleKernel::run_op(ITensorPack &tensors, const Window &window, 
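The QASYMM8 bilinear path above works in three steps: dequantize the four neighbours with the input quantization info, interpolate in float, then requantize with the output quantization info. A scalar sketch of that round trip, using the standard uniform affine scheme q = round(x / scale) + offset (illustrative, not the library's Qasymm8QuantizationHelper):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

struct UniformQInfo { float scale; int32_t offset; };

static float dequantize(uint8_t q, const UniformQInfo &qi)
{
    return (static_cast<int32_t>(q) - qi.offset) * qi.scale;
}

static uint8_t quantize(float x, const UniformQInfo &qi)
{
    const int q = static_cast<int>(std::lround(x / qi.scale)) + qi.offset;
    return static_cast<uint8_t>(std::clamp(q, 0, 255));
}

static float delta_bilinear(float a00, float a01, float a10, float a11, float dx, float dy)
{
    return a00 * (1.f - dx) * (1.f - dy) + a01 * dx * (1.f - dy) + a10 * (1.f - dx) * dy + a11 * dx * dy;
}

int main()
{
    const UniformQInfo iq{ 0.5f, 10 };  // input quantization
    const UniformQInfo oq{ 0.25f, 0 };  // output quantization
    const uint8_t a00 = 12, a01 = 14, a10 = 16, a11 = 18;

    // Dequantize -> interpolate in float -> requantize for the output tensor.
    const float f = delta_bilinear(dequantize(a00, iq), dequantize(a01, iq),
                                   dequantize(a10, iq), dequantize(a11, iq), 0.5f, 0.5f);
    std::printf("%u\n", static_cast<unsigned>(quantize(f, oq))); // 10
}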
const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr && _data_layout == DataLayout::NCHW); - ARM_COMPUTE_ERROR_ON(_run_method == nullptr && _data_layout == DataLayout::NHWC); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - const auto dx = tensors.get_const_tensor(TensorType::ACL_INT_0); - const auto dy = tensors.get_const_tensor(TensorType::ACL_INT_1); - const auto offsets = tensors.get_const_tensor(TensorType::ACL_INT_2); - - if(_data_layout == DataLayout::NCHW) - { - (this->*_func)(src, dst, dx, dy, offsets, window); - } - else - { - _run_method(src, dst, offsets, dx, dy, _policy, _border_mode, _constant_border_value, _sampling_offset, _align_corners, window); - } -} - -const char *CpuScaleKernel::name() const -{ - return _name.c_str(); -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuScaleKernel.h b/src/core/cpu/kernels/CpuScaleKernel.h deleted file mode 100644 index a2b65370ba..0000000000 --- a/src/core/cpu/kernels/CpuScaleKernel.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_SCALEKERNEL_H -#define ARM_COMPUTE_CPU_SCALEKERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Arm(R) Neon(TM) kernel to perform scaling on a tensor */ -class CpuScaleKernel : public ICpuKernel -{ -public: - CpuScaleKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuScaleKernel); - /** Initialise the kernel's inputs, output and interpolation policy - * - * @note dx, dy and offsets have the same dimensions (width and height) of the output tensor - * @note Using @p policy Area only supports data layout NCHW and input data type U8. - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. - * @param[in] dx Distance x tensor info. Pixel's distance between the X real coordinate and the smallest X following integer. Data type supported: F32 - * @param[in] dy Distance y tensor info. 
Pixel's distance between the Y real coordinate and the smallest Y following integer. Data type supported: F32 - * @param[in] offsets Offset tensor info. Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32. - * @param[out] dst Destination tensor info. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. - * @param[in] info @ref ScaleKernelInfo to use for configuration - */ - void configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst, - const ScaleKernelInfo &info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuScaleKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst, - const ScaleKernelInfo &info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: -#ifdef ENABLE_NCHW_KERNELS - /** function to perform scale using area interpolation on the given window - * - * @note Used only in case down-sampling. - */ - void scale_area_nchw_u8(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window); - - /** function to perform scale using bilinear interpolation on the given window */ - template - void scale_bilinear_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window); - /** function to perform scale using bilinear interpolation on the given window */ - template - void scale_bilinear_qasymm(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window); - - /** function to perform scale using nearest neighbour on the given window */ - template - void scale_nearest_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window); -#endif // ENABLE_NCHW_KERNELS - - /** Scale function to use for the particular function to use */ - using ScaleFunctionPtr = void (CpuScaleKernel::*)(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const Window &window); - using ScaleKernelPtr = std::add_pointer::type; - - ScaleFunctionPtr _func{ nullptr }; - InterpolationPolicy _policy{}; - BorderMode _border_mode{}; - PixelValue _constant_border_value{}; - float _sampling_offset{ 0 }; - bool _align_corners{ false }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - ScaleKernelPtr _run_method{ nullptr }; - std::string _name{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_SCALEKERNEL_H */ diff --git a/src/core/cpu/kernels/CpuSoftmaxKernel.cpp b/src/core/cpu/kernels/CpuSoftmaxKernel.cpp deleted file mode 100644 index c562699092..0000000000 --- a/src/core/cpu/kernels/CpuSoftmaxKernel.cpp +++ /dev/null @@ -1,378 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuSoftmaxKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "src/core/CPP/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "src/core/common/Registrars.h" -#include "src/core/cpu/kernels/softmax/impl/neon/list.h" -#include "src/core/cpu/kernels/softmax/impl/sve/list.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -struct SoftmaxSelectorData -{ - DataType dt; - const CPUInfo &ci; -}; -using SoftmaxSelectorPtr = std::add_pointer::type; -using SoftmaxLogits1DMaxKernelPtr = std::add_pointer::type; -using SoftmaxLogits1DKernelPtr = std::add_pointer::type; - -struct SoftmaxLogits1DKernel -{ - const char *name; - const SoftmaxSelectorPtr is_selected; - SoftmaxLogits1DKernelPtr ukernel; -}; - -struct SoftmaxLogits1DMaxKernel -{ - const char *name; - const SoftmaxSelectorPtr is_selected; - SoftmaxLogits1DMaxKernelPtr ukernel; -}; - -static const SoftmaxLogits1DKernel available_logits_1d_kernels[] = -{ -#if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_fp32_softmax_logits_1d", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32) && data.ci.has_sve(); }, - REGISTER_FP32_SVE(arm_compute::cpu::sve_softmax_logits_1d_float) - }, - { - "sve_fp16_softmax_logits_1d", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16) && data.ci.has_sve(); }, - REGISTER_FP16_SVE(arm_compute::cpu::sve_softmax_logits_1d_float) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ - -#if defined(ARM_COMPUTE_ENABLE_NEON) - { - "neon_fp32_softmax_logits_1d", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_softmax_logits_1d_float) - }, -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_softmax_logits_1d", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_softmax_logits_1d_float) - }, -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ - -#if defined(ARM_COMPUTE_ENABLE_SVE2) - { - "sve2_qu8_softmax_logits_1d", - [](const 
SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8) && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_softmax_logits_1d_quantized) - }, - { - "sve2_qs8_softmax_logits_1d", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve2(); }, - REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_softmax_logits_1d_quantized) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ - { - "neon_qu8_softmax_logits_1d", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_softmax_logits_1d_quantized) - }, - { - "neon_qs8_softmax_logits_1d", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_softmax_logits_1d_quantized) - }, -}; - -static const SoftmaxLogits1DMaxKernel available_logits_1d_max_kernels[] = -{ -#if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_fp32_logits_1d_max", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32) && data.ci.has_sve(); }, - REGISTER_FP32_SVE(arm_compute::cpu::sve_logits_1d_max) - }, - { - "sve_fp16_logits_1d_max", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16) && data.ci.has_sve(); }, - REGISTER_FP16_SVE(arm_compute::cpu::sve_logits_1d_max) - }, - { - "sve_qu8_logits_1d_max", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8) && data.ci.has_sve(); }, - REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_logits_1d_max) - }, - { - "sve_qs8_logits_1d_max", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve(); }, - REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_logits_1d_max) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ -#if defined(ARM_COMPUTE_ENABLE_NEON) - { - "neon_fp32_logits_1d_max", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_logits_1d_max) - }, -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_logits_1d_max", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_logits_1d_max) - }, -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ - { - "neon_qu8_logits_1d_max", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_logits_1d_max) - }, - { - "neon_qs8_logits_1d_max", - [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_logits_1d_max) - }, -#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ -}; - -const SoftmaxLogits1DKernel *get_implementation_logits(const SoftmaxSelectorData &data) -{ - for(const auto &uk : available_logits_1d_kernels) - { - if(uk.is_selected({ data.dt, CPUInfo::get() })) - { - return &uk; - } - } - return nullptr; -} - -const SoftmaxLogits1DMaxKernel *get_implementation_logits_max(const SoftmaxSelectorData &data) -{ - for(const auto &uk : available_logits_1d_max_kernels) - { - if(uk.is_selected({ data.dt, CPUInfo::get() })) - { - return &uk; - } - } - return nullptr; -} - -Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, 
DataType::F16, DataType::F32); - - // Validate in case of configured output - if(output.total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), TensorShape(input.tensor_shape()).set(0, 1)); - } - - return Status{}; -} - -} // namespace - -void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*src, *dst)); - - // Softmax across the x dimension - const TensorShape output_shape = TensorShape(src->tensor_shape()).set(0, 1); - // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info()); - - const auto *uk = get_implementation_logits_max(SoftmaxSelectorData{ src->data_type(), CPUInfo::get() }); - ARM_COMPUTE_ERROR_ON_NULLPTR(uk); - - _run_method = uk->ukernel; - _name = std::string("CpuLogits1DMaxKernel").append("/").append(uk->name); - - Window win = calculate_max_window(*src, Steps()); - ICpuKernel::configure(win); -} - -Status CpuLogits1DMaxKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(*src, *dst)); - - return Status{}; -} - -void CpuLogits1DMaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_run_method == nullptr); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - _run_method(src, dst, window); -} - -const char *CpuLogits1DMaxKernel::name() const -{ - return _name.c_str(); -} - -namespace -{ -Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorInfo &max, - const ITensorInfo &dst, const float beta, const ITensorInfo &tmp, bool is_log) -{ - ARM_COMPUTE_UNUSED(beta); - // Check input - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - - const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type()); - - // Check max - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(TensorShape(src.tensor_shape()).set(0, 1), max.tensor_shape()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&src, &max); - - // Check output if configured - if(dst.total_size() != 0) - { - const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log) : dst.quantization_info(); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst); - ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != output_quantization); - } - - // Check tmp if configured - if(tmp.total_size() != 0) - { - const DataType tmp_data_type = is_quantized_asymmetric ? 
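The split between CpuLogits1DMaxKernel and the softmax kernel below follows the usual numerically stable formulation: compute the row maximum first, subtract it before exponentiation, then normalise the exponentials (or, in the log-softmax case, subtract the log of their sum). A scalar reference for both variants, for illustration only:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// Numerically stable softmax / log-softmax over one row, mirroring the max-then-normalise split.
static std::vector<float> softmax_1d(const std::vector<float> &x, float beta, bool is_log)
{
    const float max_val = *std::max_element(x.begin(), x.end()); // CpuLogits1DMaxKernel's job

    std::vector<float> out(x.size());
    float sum = 0.f;
    for(std::size_t i = 0; i < x.size(); ++i)
    {
        out[i] = (x[i] - max_val) * beta; // shifted logits: exp() can no longer overflow
        sum += std::exp(out[i]);
    }
    for(float &v : out)
    {
        v = is_log ? v - std::log(sum) : std::exp(v) / sum;
    }
    return out;
}

int main()
{
    const std::vector<float> logits{ 1.f, 2.f, 3.f };
    for(float v : softmax_1d(logits, /*beta=*/1.f, /*is_log=*/false)) { std::printf("%.4f ", v); }
    std::printf("\n"); // ~0.0900 0.2447 0.6652
}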
DataType::F32 : src.data_type(); - ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type); - // We could potentially reduce tmp memory if we could predict or make an assumption - // on the maximum number of threads that will run in parallel. - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &tmp); - } - - return Status{}; -} -} // namespace - -template -void CpuLogits1DSoftmaxKernel::configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG)); - - // Configure kernel window - const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type()); - - // Output auto initialization if not yet initialized - const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG) : dst->quantization_info(); - auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding()); - - // Tmp auto initialization if not yet initialized - const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type(); - auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding()); - - const auto *uk = get_implementation_logits(SoftmaxSelectorData{ src->data_type(), CPUInfo::get() }); - ARM_COMPUTE_ERROR_ON_NULLPTR(uk); - - std::string kernel_name = IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel"); - - _beta = beta; - _run_method = uk->ukernel; - _name = kernel_name.append("/").append(uk->name); - - // Configure kernel window - Window win = calculate_max_window(*max, Steps()); - - ICpuKernel::configure(win); -} - -template -Status CpuLogits1DSoftmaxKernel::validate(const ITensorInfo *src, const ITensorInfo *max, - const ITensorInfo *dst, const float beta, const ITensorInfo *tmp) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG)); - - return Status{}; -} - -template -void CpuLogits1DSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_run_method == nullptr); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto max = tensors.get_tensor(TensorType::ACL_SRC_1); - auto dst = tensors.get_tensor(TensorType::ACL_DST_0); - auto tmp = tensors.get_tensor(TensorType::ACL_DST_1); - - const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x(); - const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration; - - ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread)); - - void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread); - _run_method(src, max, tmp_for_thread, dst, _beta, IS_LOG, window); -} - -template -const char *CpuLogits1DSoftmaxKernel::name() const -{ - return _name.c_str(); -} - -template class CpuLogits1DSoftmaxKernel; -template class CpuLogits1DSoftmaxKernel; - -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuSoftmaxKernel.h 
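The run_op() above slices the shared tmp tensor into one contiguous region per thread, tmp_for_thread = tmp->buffer() + thread_id * tmp_size_for_thread, after asserting the buffer can hold info.num_threads such slices. A stripped-down sketch of that partitioning; ThreadInfo and the sizes are simplified stand-ins.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

struct ThreadInfo { int thread_id; int num_threads; };

// Return this thread's private slice of a shared scratch buffer.
static uint8_t *thread_scratch(std::vector<uint8_t> &tmp, std::size_t bytes_per_thread, const ThreadInfo &info)
{
    assert(tmp.size() >= bytes_per_thread * static_cast<std::size_t>(info.num_threads));
    return tmp.data() + static_cast<std::size_t>(info.thread_id) * bytes_per_thread;
}

int main()
{
    const std::size_t row_elems        = 128;                 // elements processed per iteration
    const std::size_t bytes_per_thread = row_elems * sizeof(float);
    std::vector<uint8_t> tmp(bytes_per_thread * 4);           // scratch shared by 4 workers

    for(int t = 0; t < 4; ++t)
    {
        uint8_t *slice = thread_scratch(tmp, bytes_per_thread, ThreadInfo{ t, 4 });
        std::printf("thread %d -> offset %zu\n", t, static_cast<std::size_t>(slice - tmp.data()));
    }
}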
b/src/core/cpu/kernels/CpuSoftmaxKernel.h deleted file mode 100644 index 776c0d6f79..0000000000 --- a/src/core/cpu/kernels/CpuSoftmaxKernel.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H -#define ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the identifying the max value of 1D Logits */ -class CpuLogits1DMaxKernel : public ICpuKernel -{ -public: - CpuLogits1DMaxKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DMaxKernel); - /** Set the input and output tensors. - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] dst Destination tensor info. Data types supported: same as @p input - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuLogits1DMaxKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - using SoftmaxLogits1DMaxKernelPtr = std::add_pointer::type; - -private: - SoftmaxLogits1DMaxKernelPtr _run_method{ nullptr }; - std::string _name{}; -}; - -/** Interface for softmax computation for QASYMM8 with pre-computed max. */ -template -class CpuLogits1DSoftmaxKernel : public ICpuKernel -{ -public: - CpuLogits1DSoftmaxKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DSoftmaxKernel); - - /** Set the input and output tensors. - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] max Max values tensor info. Same shape as input with dimension 0 set to 1. - * Data types supported: same as @p input. - * @param[out] dst Destination tensor info. Data types supported: same as @p input. - * @param[in] beta A scaling factor for the exponent. - * - * @param tmp Auxiliary tensor info. Must be type F32 and same shape as the input. 
- */ - void configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuLogits1DSoftmaxKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *max, - const ITensorInfo *dst, const float beta, const ITensorInfo *tmp); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - using SoftmaxLogits1DKernelPtr = std::add_pointer::type; - -private: - float _beta{ 1.0f }; - SoftmaxLogits1DKernelPtr _run_method{ nullptr }; - std::string _name{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuSubKernel.cpp b/src/core/cpu/kernels/CpuSubKernel.cpp deleted file mode 100644 index fa7a55805e..0000000000 --- a/src/core/cpu/kernels/CpuSubKernel.cpp +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuSubKernel.h" - -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" -#include "src/core/common/Registrars.h" -#include "src/core/cpu/kernels/sub/neon/list.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -struct SubSelectorData -{ - DataType dt; -}; - -using SubSelectorPtr = std::add_pointer::type; -using SubKernelPtr = std::add_pointer::type; - -struct SubKernel -{ - const char *name; - const SubSelectorPtr is_selected; - SubKernelPtr ukernel; -}; - -static const SubKernel available_kernels[] = -{ - { - "neon_fp32_sub", - [](const SubSelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(arm_compute::cpu::sub_same_neon) - }, -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_sub", - [](const SubSelectorData & data) { return (data.dt == DataType::F16); }, - REGISTER_FP16_NEON(arm_compute::cpu::sub_same_neon) - }, -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ - { - "neon_u8_sub", - [](const SubSelectorData & data) { return (data.dt == DataType::U8); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon) - }, - { - "neon_s16_sub", - [](const SubSelectorData & data) { return (data.dt == DataType::S16); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon) - }, - { - "neon_s32_sub", - [](const SubSelectorData & data) { return (data.dt == DataType::S32); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon) - }, - { - "neon_qu8_sub", - [](const SubSelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon) - }, - { - "neon_qs8_sub", - [](const SubSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon) - }, - { - "neon_qs16_sub", - [](const SubSelectorData & data) { return (data.dt == DataType::QSYMM16); }, - REGISTER_QSYMM16_NEON(arm_compute::cpu::sub_qsymm16_neon) - }, -}; - -/** Micro-kernel selector - * - * @param[in] data Selection data passed to help pick the appropriate micro-kernel - * - * @return A matching micro-kernel else nullptr - */ -const SubKernel *get_implementation(DataType dt) -{ - for(const auto &uk : available_kernels) - { - if(uk.is_selected({ dt })) - { - return &uk; - } - } - return nullptr; -} - -inline Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy) -{ - ARM_COMPUTE_UNUSED(policy); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1); - - const auto *uk = get_implementation(src0.data_type()); - ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - - const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape()); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(src0.data_type()) && (policy == ConvertPolicy::WRAP), - "Convert policy cannot be WRAP if datatype is quantized"); - - // Validate in case of 
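CpuSubKernel, like the other kernels in this move, selects its micro-kernel from a static table: each entry pairs a name with a selector predicate and a function pointer, and get_implementation returns the first entry whose predicate accepts the request. A stripped-down sketch of the same dispatch pattern, with simplified types and invented entries:

    #include <cstdio>

    enum class DataType { F32, U8 };
    struct SelectorData { DataType dt; };

    struct Kernel
    {
        const char *name;
        bool (*is_selected)(const SelectorData &);
        void (*ukernel)();
    };

    void run_fp32() { std::puts("fp32 path"); }
    void run_u8()   { std::puts("u8 path"); }

    static const Kernel available_kernels[] =
    {
        { "neon_fp32_sub", [](const SelectorData &d) { return d.dt == DataType::F32; }, run_fp32 },
        { "neon_u8_sub",   [](const SelectorData &d) { return d.dt == DataType::U8; },  run_u8 },
    };

    const Kernel *get_implementation(DataType dt)
    {
        for(const auto &uk : available_kernels)
        {
            if(uk.is_selected({ dt }))
            {
                return &uk;
            }
        }
        return nullptr;
    }

    int main()
    {
        if(const Kernel *uk = get_implementation(DataType::U8))
        {
            uk->ukernel(); // prints "u8 path"
        }
        return 0;
    }

Keeping the selector separate from the function pointer is what lets validate() reject unsupported data types before configure() ever binds a micro-kernel.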
configured dst - if(dst.total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), - "Wrong shape for dst"); - } - return Status{}; -} -} // namespace - -void CpuSubKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy)); - - const TensorShape &out_shape = TensorShape::broadcast_shape(src0->tensor_shape(), src1->tensor_shape()); - - // Auto initialize dst if not initialized - set_shape_if_empty(*dst, out_shape); - set_data_type_if_unknown(*dst, src0->data_type()); - - const auto *uk = get_implementation(src0->data_type()); - ARM_COMPUTE_ERROR_ON_NULLPTR(uk); - - _policy = policy; - _run_method = uk->ukernel; - _name = std::string("CpuSubKernel").append("/").append(uk->name); - - // CpuSubKernel doesn't need padding so update_window_and_padding() can be skipped - Window win = calculate_max_window(out_shape, Steps()); - - ICpuKernel::configure(win); -} - -Status CpuSubKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst, policy)); - - return Status{}; -} - -void CpuSubKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_run_method == nullptr); - - const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); - const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); - ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - - _run_method(src0, src1, dst, _policy, window); -} - -const char *CpuSubKernel::name() const -{ - return _name.c_str(); -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuSubKernel.h b/src/core/cpu/kernels/CpuSubKernel.h deleted file mode 100644 index cb64e64cfa..0000000000 --- a/src/core/cpu/kernels/CpuSubKernel.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
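configure() above derives the destination shape with TensorShape::broadcast_shape, and validation reports "Inputs are not broadcast compatible" when that shape comes out empty. Sketched in NumPy-style terms (the library's exact rules may differ in detail), the per-dimension check looks like this:

    #include <algorithm>
    #include <vector>

    // Illustrative: extents must match, or one of them must be 1; an empty result
    // plays the role of the "not broadcast compatible" error above.
    std::vector<size_t> broadcast_shape(std::vector<size_t> a, std::vector<size_t> b)
    {
        const size_t n = std::max(a.size(), b.size());
        a.resize(n, 1);
        b.resize(n, 1);
        std::vector<size_t> out(n);
        for(size_t i = 0; i < n; ++i)
        {
            if(a[i] != b[i] && a[i] != 1 && b[i] != 1)
            {
                return {};
            }
            out[i] = std::max(a[i], b[i]);
        }
        return out;
    }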
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_SUB_KERNEL_H -#define ARM_COMPUTE_CPU_SUB_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the kernel to perform subtraction between two tensors */ -class CpuSubKernel : public ICpuKernel -{ -public: - CpuSubKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuSubKernel); - - /** Initialise the kernel's src and dst. - * - * Valid configurations (src0,src1) -> dst : - * - * - (U8,U8) -> U8 - * - (QASYMM8, QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (S16,S16) -> S16 - * - (S32,S32) -> S32 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - * @param[in] src0 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 - * @param[in] src1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 - * @param[out] dst The dst tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32. - * @param[in] policy Overflow policy. Convert policy cannot be WRAP if datatype is quantized. - */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuSubKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - using SubKernelPtr = std::add_pointer::type; - -private: - ConvertPolicy _policy{}; - SubKernelPtr _run_method{ nullptr }; - std::string _name{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_SUB_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuTransposeKernel.cpp b/src/core/cpu/kernels/CpuTransposeKernel.cpp deleted file mode 100644 index c7cafe94a8..0000000000 --- a/src/core/cpu/kernels/CpuTransposeKernel.cpp +++ /dev/null @@ -1,510 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
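The interface above notes that ConvertPolicy::WRAP is rejected for quantized inputs. The reason is easy to see with raw 8-bit storage: a wrapped result lands on an unrelated quantized level, while saturation clamps to the nearest representable one. A two-line illustration (plain C++, not library code):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const int a = 10, b = 20;                                            // QASYMM8 stores levels as uint8_t
        const uint8_t wrapped   = static_cast<uint8_t>(a - b);               // 246: an unrelated level after wrap-around
        const uint8_t saturated = static_cast<uint8_t>(std::max(a - b, 0));  // 0: clamped to the valid range
        std::printf("wrap=%u saturate=%u\n", wrapped, saturated);
        return 0;
    }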
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuTransposeKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -unsigned int num_elems_processed(size_t element_size) -{ - switch(element_size) - { - case 1: - return 8; - case 2: - case 4: - return 4; - default: - break; - } - - ARM_COMPUTE_ERROR("Element size not supported"); -} - -void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &window) -{ - const int window_step_x = 8; - const int window_step_y = 8; - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_start_y = window.y().start(); - const int window_end_y = std::min(window.y().end(), static_cast(in->info()->dimension(1))); - const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y; - const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1]; - const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1]; - - // Check if we need a left-over loop for the y dimension - bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0); - - Window window_in(window); - window_in.set(Window::DimX, Window::Dimension(0, 1, 1)); - if(left_over_loop_y) - { - // Check if window_end_y_multiple_of is greater than window_start_y - if(window_end_y_multiple_of > window_start_y) - { - window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y)); - } - else - { - window_in.set(Window::DimY, Window::Dimension(0, 0, 1)); - } - } - - Window window_out(window); - window_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Iterator output(out, window_out); - - // Run the SIMD path if and only if the input is not a row-vector - if(in->info()->dimension(1) != 1) - { - Iterator input(in, window_in); - execute_window_loop(window_in, [&](const Coordinates & id) - { - // Compute 8x8 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x8_t row0 = vld1_u8(reinterpret_cast(input.ptr() + x + 0 * input_stride_in_bytes)); - const uint8x8_t row1 = vld1_u8(reinterpret_cast(input.ptr() + x + 1 * input_stride_in_bytes)); - const uint8x8_t row2 = vld1_u8(reinterpret_cast(input.ptr() + x + 2 * input_stride_in_bytes)); - const uint8x8_t row3 = vld1_u8(reinterpret_cast(input.ptr() + x + 3 * input_stride_in_bytes)); - const uint8x8_t row4 = vld1_u8(reinterpret_cast(input.ptr() + x + 4 * input_stride_in_bytes)); - const uint8x8_t row5 = vld1_u8(reinterpret_cast(input.ptr() + x + 5 * input_stride_in_bytes)); - const uint8x8_t row6 = vld1_u8(reinterpret_cast(input.ptr() + x + 6 * input_stride_in_bytes)); - const uint8x8_t row7 = vld1_u8(reinterpret_cast(input.ptr() + x + 7 * 
input_stride_in_bytes)); - - // Transpose 2x2 - const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1); - const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3); - const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5); - const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7); - - // Transpose 4x4 - const uint16x4x2_t k0_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0])); - const uint16x4x2_t k1_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1])); - const uint16x4x2_t k2_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0])); - const uint16x4x2_t k3_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1])); - - // Transpose 8x8 - const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0])); - const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1])); - const uint32x2x2_t k2_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0])); - const uint32x2x2_t k3_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1])); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes; - - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1]))); - vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1]))); - } - - // Compute left-over elements along the x dimension (1x8) - for(; x < window_end_x; ++x) - { - const uint8_t val0 = *(input.ptr() + x + 0 * input_stride_in_bytes); - const uint8_t val1 = *(input.ptr() + x + 1 * input_stride_in_bytes); - const uint8_t val2 = *(input.ptr() + x + 2 * input_stride_in_bytes); - const uint8_t val3 = *(input.ptr() + x + 3 * input_stride_in_bytes); - const uint8_t val4 = *(input.ptr() + x + 4 * input_stride_in_bytes); - const uint8_t val5 = *(input.ptr() + x + 5 * input_stride_in_bytes); - const uint8_t val6 = *(input.ptr() + x + 6 * input_stride_in_bytes); - const uint8_t val7 = *(input.ptr() + x + 7 * input_stride_in_bytes); - - uint8x8_t result = vdup_n_u8(0); - result = vset_lane_u8(val0, result, 0); - result = vset_lane_u8(val1, result, 1); - result = vset_lane_u8(val2, result, 2); - result = vset_lane_u8(val3, result, 3); - result = vset_lane_u8(val4, result, 4); - result = vset_lane_u8(val5, result, 5); - result = vset_lane_u8(val6, result, 6); - result = vset_lane_u8(val7, 
result, 7); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes; - - vst1_u8(output.ptr() + dst_offset_in_bytes, result); - } - }, - input, output); - } - - if(left_over_loop_y) - { - window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1)); - window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1)); - - Iterator input(in, window_in); - Iterator output(out, window_out); - - // Compute left-over elements along the y dimension (1x1) - execute_window_loop(window_in, [&](const Coordinates & id) - { - const uint8_t val0 = *input.ptr(); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes; - - *(output.ptr() + dst_offset_in_bytes) = val0; - }, - input, output); - } -} - -void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &window) -{ - const int window_step_x = 4; - const int window_step_y = 4; - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_start_y = window.y().start(); - const int window_end_y = std::min(window.y().end(), static_cast(in->info()->dimension(1))); - const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y; - const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1]; - const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1]; - - // Check if we need a left-over loop for the y dimension - bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0); - - Window window_in(window); - window_in.set(Window::DimX, Window::Dimension(0, 1, 1)); - if(left_over_loop_y) - { - // Check if window_end_y_multiple_of is greater than window_start_y - if(window_end_y_multiple_of > window_start_y) - { - window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y)); - } - else - { - window_in.set(Window::DimY, Window::Dimension(0, 0, 1)); - } - } - - Window window_out(window); - window_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Iterator output(out, window_out); - - // Run the SIMD path if and only if the input is not a row-vector - if(in->info()->dimension(1) != 1) - { - Iterator input(in, window_in); - execute_window_loop(window_in, [&](const Coordinates & id) - { - // Compute 4x4 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint16x4_t row0 = vld1_u16(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint16x4_t row1 = vld1_u16(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint16x4_t row2 = vld1_u16(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint16x4_t row3 = vld1_u16(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); - - // Transpose 2x2 - const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1); - const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3); - - // Transpose 4x4 - const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0])); - const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1])); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes; - - 
vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[0])); - vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[0])); - vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[1])); - vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[1])); - } - - // Compute left-over elements (1x4) - for(; x < window_end_x; ++x) - { - const uint16_t val0 = *(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint16_t val1 = *(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint16_t val2 = *(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint16_t val3 = *(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); - - uint16x4_t result = vdup_n_u16(0); - result = vset_lane_u16(val0, result, 0); - result = vset_lane_u16(val1, result, 1); - result = vset_lane_u16(val2, result, 2); - result = vset_lane_u16(val3, result, 3); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes; - - vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes), result); - } - }, - input, output); - } - - if(left_over_loop_y) - { - window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1)); - window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1)); - - Iterator input(in, window_in); - Iterator output(out, window_out); - - // Compute left-over elements along the y dimension (1x1) - execute_window_loop(window_in, [&](const Coordinates & id) - { - const uint16_t val0 = *(reinterpret_cast(input.ptr())); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes; - - *(reinterpret_cast(output.ptr() + dst_offset_in_bytes)) = val0; - }, - input, output); - } -} - -void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &window) -{ - const int window_step_x = 4; - const int window_step_y = 4; - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_start_y = window.y().start(); - const int window_end_y = std::min(window.y().end(), static_cast(in->info()->dimension(1))); - const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y; - const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1]; - const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1]; - - // Check if we need a left-over loop for the y dimension - bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0); - - Window window_in(window); - window_in.set(Window::DimX, Window::Dimension(0, 1, 1)); - if(left_over_loop_y) - { - // Check if window_end_y_multiple_of is greater than window_start_y - if(window_end_y_multiple_of > window_start_y) - { - window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y)); - } - else - { - window_in.set(Window::DimY, Window::Dimension(0, 0, 1)); - } - } - - Window window_out(window); - window_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Iterator output(out, window_out); - - // Run the SIMD path if and only 
if the input is not a row-vector - if(in->info()->dimension(1) != 1) - { - Iterator input(in, window_in); - execute_window_loop(window_in, [&](const Coordinates & id) - { - // Compute 4x4 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint32x4_t row0 = vld1q_u32(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint32x4_t row1 = vld1q_u32(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint32x4_t row2 = vld1q_u32(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint32x4_t row3 = vld1q_u32(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); - - // Transpose 2x2 - const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1)); - const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3)); - const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1)); - const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3)); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; - - // Swap block 01 with block 10 and store - vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vcombine_u32(k0_u32.val[0], k3_u32.val[0])); - vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vcombine_u32(k0_u32.val[1], k3_u32.val[1])); - vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vcombine_u32(k2_u32.val[0], k1_u32.val[0])); - vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vcombine_u32(k2_u32.val[1], k1_u32.val[1])); - } - - // Compute left-over elements (1x4) - for(; x < window_end_x; ++x) - { - const uint32_t val0 = *(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint32_t val1 = *(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint32_t val2 = *(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint32_t val3 = *(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes) + x); - - uint32x4_t result = vdupq_n_u32(0); - result = vsetq_lane_u32(val0, result, 0); - result = vsetq_lane_u32(val1, result, 1); - result = vsetq_lane_u32(val2, result, 2); - result = vsetq_lane_u32(val3, result, 3); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; - - vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes), result); - } - }, - input, output); - } - - if(left_over_loop_y) - { - window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1)); - window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1)); - - Iterator input(in, window_in); - Iterator output(out, window_out); - - // Compute left-over elements along the y dimension (1x1) - execute_window_loop(window_in, [&](const Coordinates & id) - { - const uint32_t val0 = *(reinterpret_cast(input.ptr())); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes; - - *(reinterpret_cast(output.ptr() + dst_offset_in_bytes)) = val0; - }, - input, output); - } -} -} // namespace - -void CpuTransposeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Destination auto inizialitation 
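The three transpose_*bit_elements helpers above share one structure: a SIMD main loop that transposes square blocks (8x8 bytes, or 4x4 halfwords/words) by combining vtrn lane interleaves at growing granularity, a scalar tail for leftover columns along x, and a second scalar pass for leftover rows along y; in the 32-bit variant the 4x4 result is assembled from 2x2 sub-blocks with the two off-diagonal blocks swapped before the stores ("Swap block 01 with block 10"). A plain scalar sketch of that blocking scheme, with the block size as a parameter and no intrinsics:

    #include <cstddef>

    // Illustrative: transpose an h x w row-major matrix in b x b blocks, then handle
    // leftover columns and rows separately, mirroring the kernel's loop structure.
    template <typename T>
    void transpose_blocked(const T *in, T *out, size_t h, size_t w, size_t b)
    {
        const size_t h_blocks = (h / b) * b;
        const size_t w_blocks = (w / b) * b;

        // Main loop: full b x b blocks (the SIMD path).
        for(size_t y = 0; y < h_blocks; y += b)
            for(size_t x = 0; x < w_blocks; x += b)
                for(size_t by = 0; by < b; ++by)
                    for(size_t bx = 0; bx < b; ++bx)
                        out[(x + bx) * h + (y + by)] = in[(y + by) * w + (x + bx)];

        // Leftover columns along x (the scalar 1xN tail).
        for(size_t y = 0; y < h_blocks; ++y)
            for(size_t x = w_blocks; x < w; ++x)
                out[x * h + y] = in[y * w + x];

        // Leftover rows along y (the scalar 1x1 tail).
        for(size_t y = h_blocks; y < h; ++y)
            for(size_t x = 0; x < w; ++x)
                out[x * h + y] = in[y * w + x];
    }

Splitting the tails out is what lets the kernel run without padding: the vector loop never reads or writes past the last full block.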
if not yet initialized - const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src); - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst)); - - // Note: This kernel performs 16 elements per iteration. - // However, since we use a left-over for loop on both dimensions (X and Y), we cannot have any read or write out of memory - // For this reason num_elems_processed_per_iteration_x is set to 1 - const unsigned int num_elems_processed_per_iteration_x = 1; - const unsigned int num_elems_processed_per_iteration_y = num_elems_processed(src->element_size()); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - // The CpuTranspose doesn't need padding so update_window_and_padding() can be skipped - Coordinates coord; - coord.set_num_dimensions(dst->num_dimensions()); - dst->set_valid_region(ValidRegion(coord, dst->tensor_shape())); - - ICpuKernel::configure(win); -} - -Status CpuTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); - //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - - // Error if input is not 8 bit, 16bit or 32bit - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->element_size() != 1 && src->element_size() != 2 && src->element_size() != 4, - "Element size not supported"); - - // Validate configured destination - if(dst->total_size() != 0) - { - const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - - return Status{}; -} - -void CpuTransposeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - switch(src->info()->element_size()) - { - case 1: - transpose_8bit_elements(src, dst, window); - break; - case 2: - transpose_16bit_elements(src, dst, window); - break; - case 4: - transpose_32bit_elements(src, dst, window); - break; - default: - ARM_COMPUTE_ERROR("Element size not supported"); - break; - } -} - -const char *CpuTransposeKernel::name() const -{ - return "CpuTransposeKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuTransposeKernel.h b/src/core/cpu/kernels/CpuTransposeKernel.h deleted file mode 100644 index 920349d5e7..0000000000 --- a/src/core/cpu/kernels/CpuTransposeKernel.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_TRANSPOSE_KERNEL_H -#define ARM_COMPUTE_CPU_TRANSPOSE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel which transposes the elements of a matrix */ -class CpuTransposeKernel : public ICpuKernel -{ -public: - CpuTransposeKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuTransposeKernel); - /** Configure kernel for a given list of arguments - * - * @param[in] src Srouce tensor to permute. Data types supported: All - * @param[out] dst Destination tensor. Data types supported: Same as @p src - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuTransposeKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_TRANSPOSE_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuWeightsReshapeKernel.cpp b/src/core/cpu/kernels/CpuWeightsReshapeKernel.cpp deleted file mode 100644 index 79f058944d..0000000000 --- a/src/core/cpu/kernels/CpuWeightsReshapeKernel.cpp +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuWeightsReshapeKernel.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -TensorShape get_output_shape(const ITensorInfo *src, bool has_bias) -{ - TensorShape output_shape{ src->tensor_shape() }; - - output_shape.collapse(3); - const size_t tmp_dim = output_shape[0]; - output_shape.set(0, output_shape[1]); - output_shape.set(1, tmp_dim + (has_bias ? 1 : 0)); - - return output_shape; -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - - if(biases != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(src->data_type())); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); - ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->num_dimensions() != 1)); - ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->num_dimensions() != 2)); - ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->dimension(0) != src->tensor_shape()[3])); - ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->dimension(0) != src->tensor_shape()[3] || biases->dimension(1) != src->tensor_shape()[4])); - } - - // Checks performed when output is configured - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), get_output_shape(src, biases != nullptr)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - } - - return Status{}; -} -} // namespace - -void CpuWeightsReshapeKernel::configure(const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(get_output_shape(src, (biases != nullptr)))); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, - biases, - dst)); - - // Configure kernel - Window window = calculate_max_window(*src, Steps()); - window.set(Window::DimX, Window::Dimension(0, src->dimension(0), src->dimension(0))); - window.set(Window::DimY, Window::Dimension(0, src->dimension(1), src->dimension(1))); - window.set(Window::DimZ, Window::Dimension(0, src->dimension(2), src->dimension(2))); - ICpuKernel::configure(window); -} - -Status CpuWeightsReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, biases, dst)); - return Status{}; -} - -void CpuWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - 
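get_output_shape above collapses the first three weight dimensions, swaps the two that remain, and adds one extra row when a bias is appended. The arithmetic, as a small self-checking sketch (hypothetical helper, plain std types):

    #include <array>
    #include <cassert>
    #include <cstddef>

    // Illustrative: weights are [kernel_x, kernel_y, IFM, OFM]; the reshaped matrix is
    // [OFM, kernel_x * kernel_y * IFM (+ 1 when a bias row is appended)].
    std::array<size_t, 2> reshaped_weights_shape(size_t kx, size_t ky, size_t ifm, size_t ofm, bool has_bias)
    {
        return { ofm, kx * ky * ifm + (has_bias ? 1u : 0u) };
    }

    int main()
    {
        // 3x3 kernel, 2 input channels, 4 output channels, bias appended:
        // x-dimension 4 (one column per OFM), y-dimension 18 linearised elements + 1 bias row.
        const auto shape = reshaped_weights_shape(3, 3, 2, 4, true);
        assert(shape[0] == 4 && shape[1] == 19);
        return 0;
    }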
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto biases = tensors.get_const_tensor(TensorType::ACL_BIAS); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - const unsigned int kernel_size_x = src->info()->dimension(0); - const unsigned int kernel_size_y = src->info()->dimension(1); - const unsigned int kernel_depth = src->info()->dimension(2); - const unsigned int input_stride_x = src->info()->strides_in_bytes().x(); - const unsigned int input_stride_y = src->info()->strides_in_bytes().y(); - const unsigned int input_stride_z = src->info()->strides_in_bytes().z(); - const unsigned int output_stride_y = dst->info()->strides_in_bytes().y(); - - // Create iterators - Iterator in(src, window); - execute_window_loop(window, [&](const Coordinates & id) - { - // Get column index - const int kernel_idx = id[3]; - const int kernel_idz = id[4]; - - // Setup pointers - const uint8_t *tmp_input_ptr = in.ptr(); - uint8_t *tmp_output_ptr = dst->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz)); - const uint8_t *curr_input_row_ptr = tmp_input_ptr; - const uint8_t *curr_input_depth_ptr = tmp_input_ptr; - - // Linearize volume - for(unsigned int d = 0; d < kernel_depth; ++d) - { - for(unsigned int j = 0; j < kernel_size_y; ++j) - { - for(unsigned int i = 0; i < kernel_size_x; ++i) - { - std::memcpy(tmp_output_ptr, tmp_input_ptr, src->info()->element_size()); - tmp_input_ptr += input_stride_x; - tmp_output_ptr += output_stride_y; - } - curr_input_row_ptr += input_stride_y; - tmp_input_ptr = curr_input_row_ptr; - } - curr_input_depth_ptr += input_stride_z; - curr_input_row_ptr = curr_input_depth_ptr; - tmp_input_ptr = curr_input_depth_ptr; - } - - // Add bias - if(biases != nullptr) - { - std::memcpy(tmp_output_ptr, biases->ptr_to_element(Coordinates(kernel_idx, kernel_idz)), src->info()->element_size()); - } - }, - in); -} -const char *CpuWeightsReshapeKernel::name() const -{ - return "CpuWeightsReshapeKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/CpuWeightsReshapeKernel.h b/src/core/cpu/kernels/CpuWeightsReshapeKernel.h deleted file mode 100644 index eea150a96e..0000000000 --- a/src/core/cpu/kernels/CpuWeightsReshapeKernel.h +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CPU_WEIGHTSRESHAPE_KERNEL_H -#define ARM_COMPUTE_CPU_WEIGHTSRESHAPE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Kernel to perform reshaping on the weights used by convolution and locally connected layer - * - * Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels. - * In combination with the @ref cpu::kernels::CpuIm2ColKernel can transform a convolution to a matrix multiplication. - * - * For example assuming a 3D weight kernel of 3x3 dimensions and depth of 2 we have: - * @f[ - * \left( \begin{array}{ccc} - * a000 & a001 & a002 \\ - * a010 & a011 & a012 \\ - * a020 & a021 & a022 \\ - * \end{array} \right) - * \left( \begin{array}{ccc} - * a100 & a101 & a102 \\ - * a110 & a111 & a112 \\ - * a120 & a121 & a122 \\ - * \end{array} \right) - * \rightarrow - * \left( \begin{array}{ccccccccc} - * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\ - * \end{array} \right) - * @f] - */ -class CpuWeightsReshapeKernel : public ICpuKernel -{ -public: - /** Default constructor */ - CpuWeightsReshapeKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuWeightsReshapeKernel); - /** Set the input and output of the kernel. - * - * @param[in] src The input tensor info to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, - * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. - * Data types supported: All - * @param[in] biases The shared biases tensor info to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with - * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input - * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. - * @param[out] dst The output tensor info. Data types supported: Same as @p src - */ - void configure(const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuWeightsReshapeKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_WEIGHTSRESHAPE_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp b/src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp deleted file mode 100644 index 9456f96354..0000000000 --- a/src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp +++ /dev/null @@ -1,551 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
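The header documentation above shows a depth-2, 3x3 kernel flattened into a single 18-element row, and run_op implements exactly that with a depth/row/column loop over strided pointers (plus one memcpy for the bias). A simplified contiguous-memory sketch of the linearisation, illustrative indexing only; the real kernel writes each flattened kernel down a column of the output in output_stride_y steps:

    #include <cstddef>

    // Illustrative: flatten one [kx, ky, depth] kernel volume (contiguous, x fastest)
    // into a single row, optionally appending its bias value at the end.
    void linearize_kernel(const float *kernel, size_t kx, size_t ky, size_t depth,
                          float *row, const float *bias)
    {
        size_t o = 0;
        for(size_t d = 0; d < depth; ++d)
            for(size_t y = 0; y < ky; ++y)
                for(size_t x = 0; x < kx; ++x)
                    row[o++] = kernel[(d * ky + y) * kx + x];
        if(bias != nullptr)
        {
            row[o] = *bias; // the extra element that the +1 in the output shape accounts for
        }
    }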
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuWinogradConv2dKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/kernels/convolution/common/utils.hpp" -#include "src/core/NEON/kernels/convolution/winograd/winograd_layer.hpp" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -//Batched Gemms - -namespace -{ -inline bool is_kernel_size_supported(DataType data_type, Size2D size) -{ - const std::array f32_support = { { Size2D(1, 3), Size2D(3, 1), Size2D(5, 5), Size2D(3, 3), Size2D(1, 5), Size2D(5, 1), Size2D(7, 1), Size2D(1, 7) } }; - const std::array f16_support = { { Size2D(3, 3) } }; - - switch(data_type) - { - case DataType::F16: - return std::end(f16_support) != std::find(std::begin(f16_support), std::end(f16_support), size); - case DataType::F32: - return std::end(f32_support) != std::find(std::begin(f32_support), std::end(f32_support), size); - default: - return false; - } -} - -Status validate_arguments_winograd_weight_trans(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - - const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - const auto input_width = input->dimension(idx_width); - const auto input_height = input->dimension(idx_height); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_kernel_size_supported(input->data_type(), Size2D(input_width, input_height)), - "Only 1x3, 3x1, 1x5, 5x1, 7x1, 1x7, 3x3 and 5x5 kernels are supported"); - ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); - const Size2D &output_tile = winograd_info.output_tile_size; - const std::array supported_tile_sizes = { { Size2D(2U, 2U), Size2D(4U, 4U), Size2D(1U, 6U), Size2D(6U, 1U), Size2D(4, 1), Size2D(1, 4), Size2D(2, 1), Size2D(1, 2) } }; - 
ARM_COMPUTE_RETURN_ERROR_ON(std::end(supported_tile_sizes) == std::find(std::begin(supported_tile_sizes), std::end(supported_tile_sizes), output_tile)); - - // Checks performed when output is configured - if(output->total_size() != 0) - { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_winograd_filter_transform_shape(*input, winograd_info)); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - return Status{}; -} - -std::pair validate_and_configure_window_winograd_weight_trans(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info) -{ - // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(*output, input->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_winograd_filter_transform_shape(*input, winograd_info))); - const Window win = calculate_max_window(*input, Steps(), true /* skip border*/); - return std::make_pair(Status{}, win); -} - -Status validate_arguments_winograd_input_trans(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info) -{ - const Size2D &kernel_dims = winograd_info.kernel_size; - const PadStrideInfo &conv_info = winograd_info.convolution_info; - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_kernel_size_supported(input->data_type(), Size2D(kernel_dims.width, kernel_dims.height)), - "Only 1x3, 3x1, 3x3 and 5x5 kernels are supported"); - - // Validate configured output - if(output->total_size() != 0) - { - const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - return Status{}; -} - -std::pair validate_and_configure_window_winograd_input_trans(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info) -{ - const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info); - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape)); - return std::make_pair(Status{}, calculate_max_window(*input, Steps(), true)); -} - -Status validate_arguments_winograd_output_trans(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info) -{ - const PadStrideInfo &conv_info = winograd_info.convolution_info; - const Size2D kernel_dims = winograd_info.kernel_size; - - // Number of tiles along the X and Y direction - const unsigned int num_tiles_x = std::ceil((winograd_info.input_dimensions.x() - (kernel_dims.width - 1) + conv_info.pad_left() + conv_info.pad_right()) / static_cast - (winograd_info.output_tile_size.width)); - const unsigned int num_tiles_y = std::ceil((winograd_info.input_dimensions.y() - (kernel_dims.height - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / static_cast - (winograd_info.output_tile_size.height)); - const Size2D num_tiles = 
Size2D(num_tiles_x, num_tiles_y); - - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != num_tiles.area()); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_kernel_size_supported(input->data_type(), Size2D(kernel_dims.width, kernel_dims.height)), - "Only 1x3, 3x1, 3x3 and 5x5 kernels are supported"); - - const std::array supported_gemm_sizes = { { 8U, 16U, 36U } }; - ARM_COMPUTE_RETURN_ERROR_ON(std::end(supported_gemm_sizes) == std::find(std::begin(supported_gemm_sizes), std::end(supported_gemm_sizes), input->dimension(2))); - ARM_COMPUTE_UNUSED(kernel_dims); - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != size_t(1)); - } - - // Checks performed when output is configured - if(output->total_size() != 0) - { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_winograd_output_transform_shape(*input, winograd_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - return Status{}; -} - -std::pair validate_and_configure_window_winograd_output_trans(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info) -{ - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*output, input->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_winograd_output_transform_shape(*input, winograd_info))); - - return std::make_pair(Status{}, calculate_max_window(*input, Steps(), true)); -} -} // namespace - -Status ICpuWinogradConv2dTransformWeightsKernel::validate(const ITensorInfo *input, const ITensorInfo *weights) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - const DataLayout data_layout = input->data_layout(); - const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_kernel_size_supported(input->data_type(), Size2D(weights->dimension(width_idx), weights->dimension(height_idx))), - "Only 1x3, 3x1, 3x3 and 5x5 kernels are supported"); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); - return Status{}; -} - -template -unsigned int CpuWinogradConv2dTransformWeightsKernel::get_weight_storage_size(int num_output_channels, int num_input_channels) const -{ - const KernelShape shape(num_output_channels, KernelRows, KernelCols, num_input_channels); - // WinogradConv returns the size in bytes, we divide by `sizeof(T)` to express that in units of T - return static_cast(WinogradConv::get_kernel_storage_size(num_input_channels, num_output_channels) / sizeof(T)); -} - -template -CpuWinogradConv2dTransformWeightsKernel::CpuWinogradConv2dTransformWeightsKernel() - : _transform(nullptr), _num_output_channels(0), _matrix_stride(0) -{ -} - -template -int CpuWinogradConv2dTransformWeightsKernel::get_matrix_stride(int num_output_channels, int num_input_channels) const -{ - return 
WinogradConv::get_kernel_matrix_stride(num_input_channels, num_output_channels); -} - -#ifndef DOXYGEN_SKIP_THIS -template -void CpuWinogradConv2dTransformWeightsKernel::configure( - const ITensorInfo *weights_hwio, - ITensorInfo *output, - const int matrix_stride, /** Stride across matrices in the output. */ - const int num_output_channels, /** Number of filters. */ - const int num_input_channels) /** Number of channels in each filter. */ -{ - ARM_COMPUTE_UNUSED(weights_hwio, output); - - _transform = std::make_unique(num_output_channels, num_input_channels); - _num_output_channels = num_output_channels; - _matrix_stride = matrix_stride; - - Window win; - auto win_last = _transform->get_window(); - win.set(Window::DimX, Window::Dimension(0, win_last, 1)); - ICpuKernel::configure(win); -} -#endif /* DOXYGEN_SKIP_THIS */ - -template -void CpuWinogradConv2dTransformWeightsKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON(tensors.empty()); - - const size_t fst = window.x().start(); - const size_t lst = window.x().end(); - - const ITensor *weights_hwio = tensors.get_const_tensor(TensorType::ACL_SRC); - ITensor *output = tensors.get_tensor(TensorType::ACL_DST); - - _transform->set_weight_tensor(weights_hwio->buffer()); - const int matrix_row_stride = roundup(_num_output_channels, WinogradConv::N_BLOCK); - _transform->set_output_matrices(output->buffer(), _matrix_stride, matrix_row_stride); - _transform->set_working_space(output->buffer()); - - _transform->run(fst, lst); -} - -template -bool CpuWinogradConv2dTransformWeightsKernel::is_parallelisable() const -{ - return false; -} - -template -Status CpuWinogradConv2dTransformWeightsKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const WinogradInfo &winograd_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_winograd_weight_trans(input, output, winograd_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_winograd_weight_trans(input->clone().get(), output->clone().get(), winograd_info).first); - return Status{}; -} - -template class CpuWinogradConv2dTransformWeightsKernel; -template class CpuWinogradConv2dTransformWeightsKernel; -template class CpuWinogradConv2dTransformWeightsKernel; -template class CpuWinogradConv2dTransformWeightsKernel; -template class CpuWinogradConv2dTransformWeightsKernel; - -template class CpuWinogradConv2dTransformWeightsKernel; -template class CpuWinogradConv2dTransformWeightsKernel; -template class CpuWinogradConv2dTransformWeightsKernel; -template class CpuWinogradConv2dTransformWeightsKernel; - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template class CpuWinogradConv2dTransformWeightsKernel<__fp16, 4, 4, 3, 3>; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -// Input transform - -template -unsigned int CpuWinogradConv2dTransformInputKernel::get_input_storage_size( - int num_batches, /* Number of batches in the input tensor. */ - int num_channels, /* Number of feature maps in the input tensor. */ - int num_rows, /* Number of rows in each feature map. */ - int num_cols, /* Number of columns in each feature map. */ - bool same_padding /* Use "SAME" padding, otherwise use "VALID". */ -) const -{ - // Construct shapes for the input and kernel tensors. 
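// Only a size query is performed here; no tensors are allocated. With "SAME" padding the spatial extent of the output is preserved, so more tiles (and therefore more storage) are needed than with "VALID" padding for the same input size.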
- const Tensor4DShape input_shape(num_batches, num_rows, num_cols, num_channels); - const KernelShape kern_shape(1, KernelRows, KernelCols, num_channels); - // Return the size, converted into units of TIn - return static_cast(WinogradConv::get_input_storage_size(num_batches, num_rows, num_cols, num_channels, same_padding) / sizeof(T)); -} - -template -unsigned int CpuWinogradConv2dTransformInputKernel::get_working_space_size(unsigned int num_threads) const -{ - return _transform->get_working_space_size(num_threads); -} - -template -int CpuWinogradConv2dTransformInputKernel::get_matrix_stride( - int num_batches, /* Number of batches in the input tensor. */ - int num_channels, /* Number of feature maps in the input tensor. */ - int num_rows, /* Number of rows in each feature map. */ - int num_cols, /* Number of columns in each feature map. */ - bool same_padding /* Use "SAME" padding, otherwise use "VALID". */) const -{ - return WinogradConv::get_input_matrix_stride(num_batches, num_rows, num_cols, num_channels, same_padding); -} - -template -CpuWinogradConv2dTransformInputKernel::CpuWinogradConv2dTransformInputKernel() - : _transform(nullptr), _num_channels(0), _matrix_stride(0) -{ -} - -template -void CpuWinogradConv2dTransformInputKernel::configure( - const ITensorInfo *input_nhwc, - const int num_batches, /* Number of batches in input tensor. */ - const int num_rows, /* Number of rows in input tensor. */ - const int num_cols, /* Number of columns in input tensor. */ - const int num_channels, /* Number of channels in input tensor. */ - const PaddingType padding, /* Padding type. */ - ITensorInfo *output, /* Base of output matrices. */ - const int matrix_stride, /* Stride between output matrices. */ - ITensorInfo *workspace) -{ - ARM_COMPUTE_UNUSED(input_nhwc, output, matrix_stride, workspace); - - _num_channels = num_channels; - _matrix_stride = matrix_stride; - - const int padding_top = (padding == PADDING_SAME) ? (KernelRows - 1) / 2 : 0; - const int padding_left = (padding == PADDING_SAME) ? (KernelCols - 1) / 2 : 0; - const int padding_bottom = (padding == PADDING_SAME) ? iceildiv(KernelRows - 1, 2) : 0; - const int padding_right = (padding == PADDING_SAME) ? iceildiv(KernelCols - 1, 2) : 0; - - _transform = std::make_unique( - KernelRows, - KernelCols, - num_batches, - num_rows, - num_cols, - num_channels, - padding_top, /**< Padding to apply to the top of the image. */ - padding_left, /**< Padding to apply to the left of the image. */ - padding_bottom, /**< Padding to apply to the bottom of the image. */ - padding_right /**< Padding to apply to the right of the image. 
*/ - ); - - Window win; - auto win_last = _transform->get_window(); - win.set(Window::DimX, Window::Dimension(0, win_last, 1)); - ICpuKernel::configure(win); -} - -template -void CpuWinogradConv2dTransformInputKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON(tensors.empty()); - - const ITensor *input_nhwc = tensors.get_const_tensor(TensorType::ACL_SRC); - const ITensor *workspace = tensors.get_const_tensor(TensorType::ACL_INT); - ITensor *output = tensors.get_tensor(TensorType::ACL_DST); - - const int element_size_in_bytes = input_nhwc->info()->element_size(); - const int input_col_stride = input_nhwc->info()->strides_in_bytes().y() / element_size_in_bytes; - const int input_row_stride = input_nhwc->info()->strides_in_bytes().z() / element_size_in_bytes; - const int input_batch_stride = input_nhwc->info()->strides_in_bytes()[3] / element_size_in_bytes; - const auto input_nhwc_ptr = reinterpret_cast(input_nhwc->buffer() + input_nhwc->info()->offset_first_element_in_bytes()); - auto output_ptr = reinterpret_cast(output->buffer() + output->info()->offset_first_element_in_bytes()); - ARM_COMPUTE_ERROR_ON_NULLPTR(output_ptr); - - _transform->set_input_tensor(input_nhwc_ptr, input_batch_stride, input_row_stride, input_col_stride); - _transform->set_output_matrices(output_ptr, _matrix_stride, _num_channels); - - _transform->set_working_space(workspace->buffer()); - - // The code below cannot be moved to configure because biases hasn't been allocated at that point - const size_t fst = window.x().start(); - const size_t lst = window.x().end(); - _transform->run(fst, lst, info.thread_id); -} - -template -Status CpuWinogradConv2dTransformInputKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const WinogradInfo &winograd_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_winograd_input_trans(input, output, winograd_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_winograd_input_trans(input->clone().get(), output->clone().get(), winograd_info).first); - - return Status{}; -} - -template class CpuWinogradConv2dTransformInputKernel; -template class CpuWinogradConv2dTransformInputKernel; -template class CpuWinogradConv2dTransformInputKernel; -template class CpuWinogradConv2dTransformInputKernel; -template class CpuWinogradConv2dTransformInputKernel; - -template class CpuWinogradConv2dTransformInputKernel; -template class CpuWinogradConv2dTransformInputKernel; -template class CpuWinogradConv2dTransformInputKernel; -template class CpuWinogradConv2dTransformInputKernel; - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template class CpuWinogradConv2dTransformInputKernel<__fp16, 4, 4, 3, 3>; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -// Output transform - -template -unsigned int CpuWinogradConv2dTransformOutputKernel::get_output_storage_size( - int num_batches, /* Number of batches in the output tensor. */ - int num_rows, /* Number of rows in each feature map of the input tensor. */ - int num_cols, /* Number of columns in each feature map of the input tensor. */ - int num_output_channels /* Number of feature maps in the output tensor. */ -) const -{ - // Construct shapes for the input and kernel tensors. 
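// As for the input transform, the shapes below only drive a size query. The returned storage is understood to cover the Winograd-domain output of the batched GEMMs, which the inverse transform later maps back to spatial-domain values.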
- const Tensor4DShape input_shape(num_batches, num_rows, num_cols, 1); - const KernelShape kern_shape(num_output_channels, KernelRows, KernelCols, 1); - // Return the size, converted into units of TOut - return static_cast( - WinogradConv::get_output_storage_size(num_batches, num_rows, num_cols, num_output_channels) / sizeof(T)); -} - -template -CpuWinogradConv2dTransformOutputKernel::CpuWinogradConv2dTransformOutputKernel() - : _transform(nullptr), _matrix_stride(0), _matrix_row_stride(0) -{ -} - -template -unsigned int CpuWinogradConv2dTransformOutputKernel::get_working_space_size(unsigned int num_threads) const -{ - return _transform->get_working_space_size(num_threads); -} - -template -int CpuWinogradConv2dTransformOutputKernel::get_matrix_stride( - int num_batches, /* Number of batches in the output tensor. */ - int num_rows, /* Number of rows in each feature map of the input tensor. */ - int num_cols, /* Number of columns in each feature map of the input tensor. */ - int num_output_channels /* Number of feature maps in the output tensor. */ -) const -{ - return WinogradConv::get_output_matrix_stride(num_batches, num_rows, num_cols, num_output_channels); -} - -template -std::pair CpuWinogradConv2dTransformOutputKernel::get_output_shape( - int num_rows, /* Number of rows in each feature map of the input tensor. */ - int num_cols, /* Number of columns in each feature map of the input tensor. */ - bool padding_same) const -{ - return WinogradConv::get_output_shape(std::make_pair(num_rows, num_cols), padding_same); -} - -template -void CpuWinogradConv2dTransformOutputKernel::configure( - const ITensorInfo *biases, - const ITensorInfo *transformed_output, - const int matrix_stride, - ITensorInfo *output_nhwc, - const int num_batches, - const int num_rows, - const int num_cols, - const int num_channels, - ITensorInfo *workspace, - const arm_gemm::Activation &activation) -{ - ARM_COMPUTE_UNUSED(biases, transformed_output, output_nhwc, num_batches, num_rows, num_cols, workspace, activation); - - _matrix_stride = matrix_stride; - _matrix_row_stride = roundup(num_channels, WinogradConv::N_BLOCK); - - // We don't have the biases buffer at this stage as it hasn't been allocated, we pass in nullptr OutputTransform is only used here to compute the window - _transform = std::make_unique(num_batches, num_rows, num_cols, num_channels, activation); - Window win; - auto win_last = _transform->get_window(); - win.set(Window::DimX, Window::Dimension(0, win_last, 1)); - - ICpuKernel::configure(win); -} - -template -void CpuWinogradConv2dTransformOutputKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON(tensors.empty()); - - const ITensor *biases = tensors.get_const_tensor(TensorType::ACL_SRC_0); - const ITensor *transformed_output = tensors.get_const_tensor(TensorType::ACL_SRC_1); - ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT); - ITensor *dst_nhwc = tensors.get_tensor(TensorType::ACL_DST); - - const int out_batch_stride = dst_nhwc->info()->strides_in_bytes()[3] / sizeof(T); - const int out_row_stride = dst_nhwc->info()->strides_in_bytes()[2] / sizeof(T); - const int out_col_stride = dst_nhwc->info()->strides_in_bytes()[1] / sizeof(T); - - _transform->set_input_matrices(transformed_output->buffer(), _matrix_stride, _matrix_row_stride); - _transform->set_bias((biases ? 
reinterpret_cast(biases->buffer() + biases->info()->offset_first_element_in_bytes()) : nullptr)); - _transform->set_output_tensor(dst_nhwc->buffer() + dst_nhwc->info()->offset_first_element_in_bytes(), out_batch_stride, out_row_stride, out_col_stride); - _transform->set_working_space(workspace->buffer()); - - // The code below cannot be moved to configure because biases hasn't been allocated at that point - const size_t fst = window.x().start(); - const size_t lst = window.x().end(); - _transform->run(fst, lst, info.thread_id); -} - -template -Status CpuWinogradConv2dTransformOutputKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - const WinogradInfo &winograd_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_winograd_output_trans(input, (bias != nullptr ? bias->clone().get() : nullptr), output, winograd_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_winograd_output_trans(input->clone().get(), output->clone().get(), winograd_info).first); - - return Status{}; -} - -template class CpuWinogradConv2dTransformOutputKernel; -template class CpuWinogradConv2dTransformOutputKernel; -template class CpuWinogradConv2dTransformOutputKernel; -template class CpuWinogradConv2dTransformOutputKernel; -template class CpuWinogradConv2dTransformOutputKernel; - -template class CpuWinogradConv2dTransformOutputKernel; -template class CpuWinogradConv2dTransformOutputKernel; -template class CpuWinogradConv2dTransformOutputKernel; -template class CpuWinogradConv2dTransformOutputKernel; - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template class CpuWinogradConv2dTransformOutputKernel<__fp16, 4, 4, 3, 3>; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuWinogradConv2dKernel.h b/src/core/cpu/kernels/CpuWinogradConv2dKernel.h deleted file mode 100644 index b5a29ffd02..0000000000 --- a/src/core/cpu/kernels/CpuWinogradConv2dKernel.h +++ /dev/null @@ -1,575 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CPUWINOGRADCONV2DKERNEL_H -#define ARM_COMPUTE_CPUWINOGRADCONV2DKERNEL_H - -#include "src/core/NEON/kernels/convolution/common/convolution.hpp" -#include "src/core/NEON/kernels/convolution/common/tensor.hpp" -#include "src/core/cpu/ICpuKernel.h" - -#include "src/core/NEON/kernels/convolution/winograd/winograd_layer.hpp" - -namespace arm_compute -{ -namespace cpu -{ -/** Interface for the kernel to perform Winograd input transform. */ -class ICpuWinogradConv2dTransformInputKernel : public ICpuKernel -{ -public: - /** Get the working space required to perform the transformation. - * - * Note, the working space is only required when performing the - * transformation - hence it can be reused whenever the transformation is - * not running. - * - * @param num_threads The greatest number of threads that will be used to execute the transform. - * @return Size of working space required in bytes. - */ - virtual unsigned int get_working_space_size(unsigned int num_threads) const = 0; - - /** Determine how much memory (in units of TIn) to allocate for the - * transformed input. - * - * @param[in] num_batches Number of batches in the input tensor. - * @param[in] num_channels Number of feature maps in the input tensor. - * @param[in] num_rows Number of rows in each feature map. - * @param[in] num_cols Number of columns in each feature map. - * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". - * - * @return Storage size (in units of TIn) required. - */ - virtual unsigned int get_input_storage_size(int num_batches, int num_channels, int num_rows, int num_cols, bool same_padding) const = 0; - - /** Gets the stride between matrices in the input worspace - * - * @param[in] num_batches Number of batches in the input tensor. - * @param[in] num_channels Number of feature maps in the input tensor. - * @param[in] num_rows Number of rows in each feature map. - * @param[in] num_cols Number of columns in each feature map. - * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". - * - * @return Stride expressed in bytes. - */ - virtual int get_matrix_stride(int num_batches, int num_channels, int num_rows, int num_cols, bool same_padding) const = 0; - - /** Configure the output transform kernel. - * - * @param[in] input_nhwc Input tensor in NHWC data layout format. - * @param[in] num_batches Number of batches in input tensor. - * @param[in] num_rows Number of rows in input tensor. - * @param[in] num_cols Number of columns in input tensor. - * @param[in] num_channels Number of channels in input tensor. - * @param[in] padding Padding type. - * @param[out] output Base of output matrices. - * @param[in] matrix_stride Stride between output matrices. - * @param[in] workspace Tensor to be used as the working space during the computation. - */ - virtual void configure(const ITensorInfo *input_nhwc, const int num_batches, const int num_rows, const int num_cols, const int num_channels, - const PaddingType padding, ITensorInfo *output, const int matrix_stride, ITensorInfo *workspace) = 0; - - /** Destructor */ - virtual ~ICpuWinogradConv2dTransformInputKernel() - { - } -}; - -/** Kernel to perform Winograd input transform. 
*/ -template -class CpuWinogradConv2dTransformInputKernel : public ICpuWinogradConv2dTransformInputKernel -{ -public: - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuWinogradConv2dTransformInputKernel(const CpuWinogradConv2dTransformInputKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuWinogradConv2dTransformInputKernel &operator=(const CpuWinogradConv2dTransformInputKernel &) = delete; - /** Allow instances of this class to be moved */ - CpuWinogradConv2dTransformInputKernel(CpuWinogradConv2dTransformInputKernel &&) = default; - /** Allow instances of this class to be moved */ - CpuWinogradConv2dTransformInputKernel &operator=(CpuWinogradConv2dTransformInputKernel &&) = default; - /** Default destructor */ - ~CpuWinogradConv2dTransformInputKernel() = default; - - /** Determine how much memory (in units of TIn) to allocate for the - * transformed input. - * - * @param[in] num_batches Number of batches in the input tensor. - * @param[in] num_channels Number of feature maps in the input tensor. - * @param[in] num_rows Number of rows in each feature map. - * @param[in] num_cols Number of columns in each feature map. - * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". - * - * @return Storage size (in units of TIn) required. - */ - unsigned int get_input_storage_size( - int num_batches, - int num_channels, - int num_rows, - int num_cols, - bool same_padding) const override; - - /** Get the working space required to perform the transformation. - * - * Note, the working space is only required when performing the - * transformation - hence it can be reused whenever the transformation is - * not running. - * - * @param[in] num_threads The greatest number of threads that will be used to execute the transform. - * - * @return Size of working space required in bytes. - */ - unsigned int get_working_space_size(unsigned int num_threads) const override; - - /** Gets the stride between matrices in the input worspace - * - * @param[in] num_batches Number of batches in the input tensor. - * @param[in] num_channels Number of feature maps in the input tensor. - * @param[in] num_rows Number of rows in each feature map. - * @param[in] num_cols Number of columns in each feature map. - * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". - * - * @return Stride expressed in bytes. - */ - int get_matrix_stride( - int num_batches, - int num_channels, - int num_rows, - int num_cols, - bool same_padding) const override; - - /** Default constructor */ - CpuWinogradConv2dTransformInputKernel(); - - const char *name() const override - { - return "CpuWinogradConv2dTransformInputKernel"; - } - - /** Configure the output transform kernel. - * - * @param[in] input_nhwc Input tensor. Data types supported: F16/F32. Layout supported NHWC. - * @param[in] num_batches Number of batches in input tensor. - * @param[in] num_rows Number of rows in input tensor. - * @param[in] num_cols Number of columns in input tensor. - * @param[in] num_channels Number of channels in input tensor. - * @param[in] padding Padding type. - * @param[out] output Base of output matrices. - * @param[in] matrix_stride Stride between output matrices. - * @param[in] workspace Tensor to be used as the working space during the computation. 
- */ - void configure( - const ITensorInfo *input_nhwc, - const int num_batches, - const int num_rows, - const int num_cols, - const int num_channels, - const PaddingType padding, - ITensorInfo *output, - const int matrix_stride, - ITensorInfo *workspace) override; - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - - /** Winograd base kernel */ - using WinogradBase = winograd::WinogradGEMM; - /** Winograd convolution kernel */ - using WinogradConv = typename WinogradBase::template Convolution; - - /** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2dTransformInputKernel - * - * @param[in] input First tensor input info. Data types supported: F16/F32. - * @param[in] output Output tensor info. Data types supported: same as @p input. - * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info); - -private: - using InputTransform = typename WinogradBase::template InputTransform; - - std::unique_ptr _transform{ nullptr }; - int _num_channels; /**< Number of channels in input tensor. */ - int _matrix_stride; /**< Stride between output matrices. */ -}; - -/** Interface for the kernel to perform Winograd output transform. */ -class ICpuWinogradConv2dTransformOutputKernel : public ICpuKernel -{ -public: - /** Get the working space required to perform the transformation. - * - * Note, the working space is only required when performing the - * transformation - hence it can be reused whenever the transformation is - * not running. - * - * @param[in] num_threads The greatest number of threads that will be used to execute the transform. - * - * @return Size of working space required in bytes. - */ - virtual unsigned int get_working_space_size(unsigned int num_threads) const = 0; - - /** Determine how much memory (in units of TOut) to allocate for the - * (Winograd domain) output. - * - * @param[in] num_batches Number of batches in the output tensor. - * @param[in] num_rows Number of rows in each feature map of the input tensor. - * @param[in] num_cols Number of columns in each feature map of the input tensor. - * @param[in] num_output_channels Number of feature maps in the output tensor. - * - * @return Storage size (in units of TOut) required. - */ - virtual unsigned int get_output_storage_size(int num_batches, int num_rows, int num_cols, int num_output_channels) const = 0; - - /** Gets the stride between matrices in the output worspace - * - * @param[in] num_batches Number of batches in the output tensor. - * @param[in] num_rows Number of rows in each feature map of the input tensor. - * @param[in] num_cols Number of columns in each feature map of the input tensor. - * @param[in] num_output_channels Number of feature maps in the output tensor. - * - * @return Stride expressed in bytes. - */ - virtual int get_matrix_stride(int num_batches, int num_rows, int num_cols, int num_output_channels) const = 0; - - /** Get the output shape of a convolution. - * - * @param[in] num_rows Number of rows in each feature map of the input tensor. - * @param[in] num_cols Number of columns in each feature map of the input tensor. 
- * @param[in] padding_same True if padding is SAME, false otherwise - * - * @return Shape of the output tensor - */ - virtual std::pair get_output_shape( - int num_rows, /* Number of rows in each feature map of the input tensor. */ - int num_cols, /* Number of columns in each feature map of the input tensor. */ - bool padding_same /* True if padding is SAME, false otherwise */ - ) const = 0; - - /** Configure the output transform kernel. - * - * @param[in] biases Pointer to the biases tensor. - * @param[in] transformed_output Pointer to working space for the output tensor in the Winograd domain. - * @param[in] matrix_stride Output matrix stride, can be computed with winograd::WinogradGEMM<2, 2, 3, 3>::Convolution::get_output_matrix_stride() - * @param[out] output_nhwc Pointer to a tensor in NHWC data layout ordered output tensor, in the spatial domain. - * @param[in] num_batches Number of batches in the input tensor. - * @param[in] num_rows Number of rows in output tensor. - * @param[in] num_cols Number of columns in output tensor. - * @param[in] num_channels Number of feature maps in the output tensor. - * @param[in] workspace Tensor to be used as the working space during the computation. - * @param[in] activation Activation to be used - */ - virtual void configure( - const ITensorInfo *biases, - const ITensorInfo *transformed_output, - const int matrix_stride, - ITensorInfo *output_nhwc, - const int num_batches, - const int num_rows, - const int num_cols, - const int num_channels, - ITensorInfo *workspace, - const arm_gemm::Activation &activation) = 0; - - virtual ~ICpuWinogradConv2dTransformOutputKernel() - { - } -}; - -/** Kernel to perform Winograd output transform. */ -template -class CpuWinogradConv2dTransformOutputKernel : public ICpuWinogradConv2dTransformOutputKernel -{ -public: - const char *name() const override - { - return "CpuWinogradConv2dTransformOutputKernel"; - } - /** Constructor */ - CpuWinogradConv2dTransformOutputKernel(); - - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuWinogradConv2dTransformOutputKernel(const CpuWinogradConv2dTransformOutputKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuWinogradConv2dTransformOutputKernel &operator=(const CpuWinogradConv2dTransformOutputKernel &) = delete; - /** Allow instances of this class to be moved */ - CpuWinogradConv2dTransformOutputKernel(CpuWinogradConv2dTransformOutputKernel &&) = default; - /** Allow instances of this class to be moved */ - CpuWinogradConv2dTransformOutputKernel &operator=(CpuWinogradConv2dTransformOutputKernel &&) = default; - /** Default destructor */ - ~CpuWinogradConv2dTransformOutputKernel() = default; - - // Inherited methods overridden: - /** Determine how much memory (in units of TOut) to allocate for the - * (Winograd domain) output. - * - * @param[in] num_batches Number of batches in the output tensor. - * @param[in] num_rows Number of rows in each feature map of the input tensor. - * @param[in] num_cols Number of columns in each feature map of the input tensor. - * @param[in] num_output_channels Number of feature maps in the output tensor. - * - * @return Storage size (in units of TOut) required. - */ - unsigned int get_output_storage_size(int num_batches, int num_rows, int num_cols, int num_output_channels) const override; - - /** Gets the stride between matrices in the output worspace - * - * @param[in] num_batches Number of batches in the output tensor. 
- * @param[in] num_rows Number of rows in each feature map of the input tensor. - * @param[in] num_cols Number of columns in each feature map of the input tensor. - * @param[in] num_output_channels Number of feature maps in the output tensor. - * - * @return Stride expressed in bytes. - */ - int get_matrix_stride(int num_batches, int num_rows, int num_cols, int num_output_channels) const override; - /** Get the output shape of a convolution. - * - * @param[in] num_rows Number of rows in each feature map of the input tensor. - * @param[in] num_cols Number of columns in each feature map of the input tensor. - * @param[in] padding_same True if padding is SAME, false otherwise - * - * @return Shape of the output tensor - */ - std::pair get_output_shape( - int num_rows, /* Number of rows in each feature map of the input tensor. */ - int num_cols, /* Number of columns in each feature map of the input tensor. */ - bool padding_same) const override; - - /** Get the working space required to perform the transformation. - * - * Note, the working space is only required when performing the - * transformation - hence it can be reused whenever the transformation is - * not running. - * - * @param[in] num_threads The greatest number of threads that will be used to execute the transform. - * - * @return Size of working space required in bytes. - */ - unsigned int get_working_space_size(unsigned int num_threads) const override; - - /** Configure the output transform kernel. - * - * @param[in] biases Pointer to the biases tensor. - * @param[in] transformed_output Pointer to working space for the output tensor in the Winograd domain. - * @param[in] matrix_stride Output matrix stride, can be computed with winograd::WinogradGEMM<2, 2, 3, 3>::Convolution::get_output_matrix_stride() - * @param[out] output_nhwc Pointer to a tensor with NHWC data layout, in the spatial domain. - * @param[in] num_batches Number of batches in the input tensor. - * @param[in] num_rows Number of rows in output tensor. - * @param[in] num_cols Number of columns in output tensor. - * @param[in] num_channels Number of feature maps in the output tensor. - * @param[in] workspace Tensor to be used as the working space during the computation. - * @param[in] activation Activation to be used - */ - void configure( - const ITensorInfo *biases, - const ITensorInfo *transformed_output, - const int matrix_stride, - ITensorInfo *output_nhwc, - const int num_batches, - const int num_rows, - const int num_cols, - const int num_channels, - ITensorInfo *workspace, - const arm_gemm::Activation &activation) override; - - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - - /** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2dTransformOutputKernel - * - * @param[in] input Source tensor info with shape [C, N, 16, batches] or [C, N, 36, batches]. Data types supported: F16/F32. - * @param[in] bias Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input - * @param[in] output Destination tensor info with shape [output_convolved_dims.width, output_convolved_dims.height, C, batches]. 
Data type supported: same as @p input - * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info); - -private: - using WinogradBase = winograd::WinogradGEMM; - using WinogradConv = typename WinogradBase::template Convolution; - using OutputTransform = typename WinogradBase::template OutputTransform; - - std::unique_ptr _transform{ nullptr }; - int _matrix_stride; - int _matrix_row_stride; -}; - -/** Interface for the kernel to perform Winograd weights transform. */ -class ICpuWinogradConv2dTransformWeightsKernel : public ICpuKernel -{ -public: - /** Prevent instances of this class from being copied (As this class contains pointers) */ - ICpuWinogradConv2dTransformWeightsKernel(const ICpuWinogradConv2dTransformWeightsKernel &) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - ICpuWinogradConv2dTransformWeightsKernel &operator=(const ICpuWinogradConv2dTransformWeightsKernel &) = default; - /** Allow instances of this class to be moved */ - ICpuWinogradConv2dTransformWeightsKernel(ICpuWinogradConv2dTransformWeightsKernel &&) = default; - /** Allow instances of this class to be moved */ - ICpuWinogradConv2dTransformWeightsKernel &operator=(ICpuWinogradConv2dTransformWeightsKernel &&) = default; - - ICpuWinogradConv2dTransformWeightsKernel() - { - } - virtual ~ICpuWinogradConv2dTransformWeightsKernel() - { - } - /** Determine how much memory (in units of T) to allocate for the - * transformed weights. - * - * @param[in] num_output_channels Number of output feature maps. - * @param[in] num_input_channels Number of input feature maps. - * - * @return Storage size (in units of T) required. - */ - virtual unsigned int get_weight_storage_size(int num_output_channels, int num_input_channels) const = 0; - /** Gets the stride between matrices in the kernel worspace - * - * @param[in] num_output_channels Number of output feature maps. - * @param[in] num_input_channels Number of input feature maps. - * - * @return Stride expressed in bytes. - */ - virtual int get_matrix_stride(int num_output_channels, int num_input_channels) const = 0; - - /** Configure the weights transform kernel. - * - * @param[in] weights_hwio Pointer to the weights tensor info - * @param[out] output Pointer to working space for the output tensor in the Winograd domain. - * @param[in] matrix_stride Stride across matrices in the output workspace. - * @param[in] num_output_channels Number of filters. - * @param[in] num_input_channels Number of channels in each filter. - */ - - virtual void configure(const ITensorInfo *weights_hwio, ITensorInfo *output, const int matrix_stride, const int num_output_channels, const int num_input_channels) = 0; - - /** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2dTransformWeightsKernel - * - * @param[in] input First tensor input info. Data types supported: F16/F32. - * @param[in] weights Weights tensor info. Data types supported: same as @p input. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights); -}; - -/** Kernel to perform Winograd weights transform. 
*/ -template -class CpuWinogradConv2dTransformWeightsKernel final : public ICpuWinogradConv2dTransformWeightsKernel -{ -public: - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuWinogradConv2dTransformWeightsKernel(const CpuWinogradConv2dTransformWeightsKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuWinogradConv2dTransformWeightsKernel &operator=(const CpuWinogradConv2dTransformWeightsKernel &) = delete; - /** Allow instances of this class to be moved */ - CpuWinogradConv2dTransformWeightsKernel(CpuWinogradConv2dTransformWeightsKernel &&) = default; - /** Allow instances of this class to be moved */ - CpuWinogradConv2dTransformWeightsKernel &operator=(CpuWinogradConv2dTransformWeightsKernel &&) = default; - /** Default destructor */ - ~CpuWinogradConv2dTransformWeightsKernel() = default; - - /** Default constructor. */ - CpuWinogradConv2dTransformWeightsKernel(); - const char *name() const override - { - return "CpuWinogradConv2dTransformWeightsKernel"; - } - - /** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2dTransformWeightsKernel - * - * @param[in] input Source tensor info. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout). - * kernel_x must be 3 and equal to kernel_y. Data types supported: F16/F32. - * @param[in] output Destination tensor info. The output is a 3D tensor with dimensions [OFM, IFM, 16] or [OFM, IFM, 36]. Data type supported: same as @p input - * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info); - - // Inherited methods overridden: - -#ifndef DOXYGEN_SKIP_THIS - /** Configure the weights transform kernel. - * - * @param[in] weights_hwio Pointer to the weights tensor info - * @param[out] output Pointer to working space for the output tensor in the Winograd domain. - * @param[in] matrix_stride Stride across matrices in the output workspace. - * @param[in] num_output_channels Number of filters. - * @param[in] num_input_channels Number of channels in each filter. - */ - void configure(const ITensorInfo *weights_hwio, ITensorInfo *output, const int matrix_stride, const int num_output_channels, const int num_input_channels) override; -#endif /* DOXYGEN_SKIP_THIS */ - - /** Determine how much memory (in units of T) to allocate for the - * transformed weights. - * - * @param[in] num_output_channels Number of output feature maps. - * @param[in] num_input_channels Number of input feature maps. - * - * @return Storage size (in units of T) required. - */ - unsigned int get_weight_storage_size(int num_output_channels, int num_input_channels) const override; - - /** Gets the stride between matrices in the input worspace - * - * @param[in] num_output_channels Number of output feature maps. - * @param[in] num_input_channels Number of input feature maps. - * - * @return Stride expressed in bytes. 
- */ - int get_matrix_stride(int num_output_channels, int num_input_channels) const override; - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - bool is_parallelisable() const override; - -private: - using WinogradBase = winograd::WinogradGEMM; - using WinogradConv = typename WinogradBase::template Convolution; - using WeightsTransform = typename WinogradBase::template WeightsTransform; - - std::unique_ptr _transform{ nullptr }; - int _num_output_channels; - int _matrix_stride; -}; - -/** Kernel to perform Winograd. */ -template -class CpuWinogradConv2dConfiguration -{ -public: - /** Winograd base kernel */ - using WinogradBase = winograd::WinogradGEMM; - /** Winograd convolution kernel */ - - using WinogradConv = typename WinogradBase::template Convolution; - - using TransformInputKernel = CpuWinogradConv2dTransformInputKernel; - using TransformWeightsKernel = CpuWinogradConv2dTransformWeightsKernel; - using TransformOutputKernel = CpuWinogradConv2dTransformOutputKernel; -}; - -} // namespace cpu -} // namespace arm_compute -#endif /*ARM_COMPUTE_CPUWINOGRADCONV2DKERNEL_H*/ diff --git a/src/core/cpu/kernels/activation/list.h b/src/core/cpu/kernels/activation/list.h deleted file mode 100644 index 409d025db0..0000000000 --- a/src/core/cpu/kernels/activation/list.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H -#define SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H - -namespace arm_compute -{ -namespace cpu -{ -#define DECLARE_ACTIVATION_KERNEL(func_name) \ - void func_name(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) - -DECLARE_ACTIVATION_KERNEL(qasymm8_neon_activation); -DECLARE_ACTIVATION_KERNEL(qasymm8_sve_activation); -DECLARE_ACTIVATION_KERNEL(qasymm8_signed_neon_activation); -DECLARE_ACTIVATION_KERNEL(qasymm8_signed_sve_activation); -DECLARE_ACTIVATION_KERNEL(qsymm16_neon_activation); -DECLARE_ACTIVATION_KERNEL(qsymm16_sve_activation); -DECLARE_ACTIVATION_KERNEL(fp16_neon_activation); -DECLARE_ACTIVATION_KERNEL(fp16_sve_activation); -DECLARE_ACTIVATION_KERNEL(fp32_neon_activation); -DECLARE_ACTIVATION_KERNEL(fp32_sve_activation); - -#undef DECLARE_ACTIVATION_KERNEL -} // namespace cpu -} // namespace arm_compute - -#endif /* SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H */ diff --git a/src/core/cpu/kernels/activation/neon/fp16.cpp b/src/core/cpu/kernels/activation/neon/fp16.cpp deleted file mode 100644 index 6f2d5d8533..0000000000 --- a/src/core/cpu/kernels/activation/neon/fp16.cpp +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/NEON/NEMath.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "src/core/NEON/wrapper/wrapper.h" - -#include -#include -#include - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) - -namespace arm_compute -{ -namespace cpu -{ -namespace -{ -#ifndef __aarch64__ -inline float16x8_t mask_float_vector(const float16x8_t &in, const uint16x8_t &mask) -{ - auto int_in = vreinterpretq_u16_f16(in); - return vreinterpretq_f16_u16(wrapper::vand(int_in, mask)); -} -#endif /* __aarch64__ */ -} // namespace - -void fp16_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - /** SIMD vector tag type. 
*/ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - constexpr int window_step_x = 8; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - // In case of non-aarch64, a small delta value is added to the input - // to prevent NAN values caused by zeros in inputs to SQRT. - // In case of aarh64, we call vsqrt directly, so we don't use delta. -#ifndef __aarch64__ - const auto delta = wrapper::vdup_n(static_cast((1e-7), ExactTagType {})); -#endif /* __aarch64__ */ - - const auto const_1 = wrapper::vdup_n(static_cast(1.f), ExactTagType{}); - const auto const_0 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - const auto const_6 = wrapper::vdup_n(static_cast(6.f), ExactTagType{}); - const auto const_3 = wrapper::vdup_n(static_cast(3.f), ExactTagType{}); - const auto const_inv_6 = wrapper::vdup_n(static_cast(0.166666667f), ExactTagType{}); - - constexpr float soft_relu_thresh = 12.f; - const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast(soft_relu_thresh), ExactTagType{}); - - const auto va = wrapper::vdup_n(static_cast(act_info.a()), ExactTagType{}); - const auto vb = wrapper::vdup_n(static_cast(act_info.b()), ExactTagType{}); - const auto a = static_cast(act_info.a()); - const auto b = static_cast(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - wrapper::traits::neon_bitvector_t tmp; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - switch(act) - { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = wrapper::vabs(vin); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = wrapper::vmla(vb, va, vin); - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin)))); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = wrapper::vmax(const_0, vin); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = wrapper::vmin(va, wrapper::vmax(vb, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin)); - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin)))); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1))); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: -#ifdef __aarch64__ - tmp = wrapper::vsqrt(vin); -#else /* __aarch64__ */ - { - const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0, ExactTagType{})); - tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, 
mask_float_vector(delta, bitmask)))); - tmp = mask_float_vector(tmp, wrapper::vnot(bitmask)); - } -#endif /* __aarch64__ */ - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = wrapper::vmul(vin, vin); - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin))); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = vin; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3))))); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - wrapper::vstore(output_ptr + x, tmp); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float16_t in = *(reinterpret_cast(input_ptr + x)); - float16_t tmp; - switch(act) - { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = std::abs(in); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = a * in + b; - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = static_cast(1) / (static_cast(1) + std::exp(-in)); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = std::max(static_cast(0), in); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = std::min(a, std::max(static_cast(0), in)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = std::min(a, std::max(b, in)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = (in > 0) ? in : a * in; - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = (in > soft_relu_thresh) ? in : std::log(static_cast(1) + std::exp(in)); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = (in >= 0) ? in : a * (std::exp(in) - 1); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: - tmp = std::sqrt(in); - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = in * in; - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = a * std::tanh(b * in); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = in; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - *(output_ptr + x) = tmp; - } - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute - -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/core/cpu/kernels/activation/neon/fp32.cpp b/src/core/cpu/kernels/activation/neon/fp32.cpp deleted file mode 100644 index 54301d45ad..0000000000 --- a/src/core/cpu/kernels/activation/neon/fp32.cpp +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace -{ -#ifndef __aarch64__ -inline float32x4_t mask_float_vector(const float32x4_t &in, const uint32x4_t &mask) -{ - auto int_in = vreinterpretq_u32_f32(in); - return vreinterpretq_f32_u32(wrapper::vand(int_in, mask)); -} -#endif /* __aarch64__ */ -} // namespace - -void fp32_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - /** SIMD vector tag type. */ - using ExactTagType = typename arm_compute::wrapper::traits::neon_bitvector_tag_t; - - constexpr int window_step_x = 4; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - // In case of non-aarch64, a small delta value is added to the input - // to prevent NAN values caused by zeros in inputs to SQRT. - // In case of aarh64, we call vsqrt directly, so we don't use delta. 
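// On non-aarch64 the zero lanes are handled explicitly: delta is added only to the lanes that compare equal to zero (via mask_float_vector) before the reciprocal square root, and those lanes are cleared again afterwards so that sqrt(0) still yields 0.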
-#ifndef __aarch64__ - const auto delta = wrapper::vdup_n(static_cast(1e-24), ExactTagType {}); -#endif /* __aarch64__ */ - const auto const_1 = wrapper::vdup_n(static_cast(1.f), ExactTagType {}); - const auto const_0 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - const auto const_6 = wrapper::vdup_n(static_cast(6.f), ExactTagType{}); - const auto const_3 = wrapper::vdup_n(static_cast(3.f), ExactTagType{}); - const auto const_inv_6 = wrapper::vdup_n(static_cast(0.166666667f), ExactTagType{}); - - constexpr float soft_relu_thresh = 12.f; - const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast(soft_relu_thresh), ExactTagType{}); - - const auto va = wrapper::vdup_n(static_cast(act_info.a()), ExactTagType{}); - const auto vb = wrapper::vdup_n(static_cast(act_info.b()), ExactTagType{}); - const auto a = static_cast(act_info.a()); - const auto b = static_cast(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - wrapper::traits::neon_bitvector_t tmp; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - switch(act) - { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = wrapper::vabs(vin); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = wrapper::vmla(vb, va, vin); - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin)))); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = wrapper::vmax(const_0, vin); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = wrapper::vmin(va, wrapper::vmax(vb, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin)); - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin)))); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1))); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: -#ifdef __aarch64__ - tmp = wrapper::vsqrt(vin); -#else /* __aarch64__ */ - { - const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0.f, ExactTagType{})); - tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask)))); - tmp = mask_float_vector(tmp, wrapper::vnot(bitmask)); - } -#endif /* __aarch64__ */ - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = wrapper::vmul(vin, vin); - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin))); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = vin; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3))))); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - 
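[For reference, the HARD_SWISH case that closes the switch above uses the usual relu6 formulation with 1/6 pre-baked as 0.166666667f. A scalar model, illustrative only and not part of the diff:]

#include <algorithm>

// hard_swish(x) = x * relu6(x + 3) / 6
inline float hard_swish_ref(float x)
{
    const float relu6 = std::min(std::max(x + 3.f, 0.f), 6.f);
    return x * relu6 * 0.166666667f;   // e.g. hard_swish(1.f) = 1 * 4 / 6 ≈ 0.6667
}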
wrapper::vstore(output_ptr + x, tmp); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float in = *(reinterpret_cast(input_ptr + x)); - float tmp; - switch(act) - { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = std::abs(in); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = a * in + b; - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = static_cast(1) / (static_cast(1) + std::exp(-in)); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = std::max(static_cast(0), in); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = std::min(a, std::max(static_cast(0), in)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = std::min(a, std::max(b, in)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = (in > 0) ? in : a * in; - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = (in > soft_relu_thresh) ? in : std::log(static_cast(1) + std::exp(in)); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = (in >= 0) ? in : a * (std::exp(in) - 1); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: - tmp = std::sqrt(in); - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = in * in; - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = a * std::tanh(b * in); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = in; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - *(output_ptr + x) = tmp; - } - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/activation/neon/qasymm8.cpp b/src/core/cpu/kernels/activation/neon/qasymm8.cpp deleted file mode 100644 index a1217435b6..0000000000 --- a/src/core/cpu/kernels/activation/neon/qasymm8.cpp +++ /dev/null @@ -1,262 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -void qasymm8_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - constexpr int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); - const qasymm8x16_t va = vdupq_n_u8(quantize_qasymm8(act_info.a(), qi_in)); - const qasymm8x16_t vb = vdupq_n_u8(quantize_qasymm8(act_info.b(), qi_in)); - const qasymm8_t a = quantize_qasymm8(act_info.a(), qi_in); - const qasymm8_t b = quantize_qasymm8(act_info.b(), qi_in); - const qasymm8_t const_0 = quantize_qasymm8(0.f, qi_in); - const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0); - const auto vconst_1 = vdupq_n_f32(1.f); -#ifndef __aarch64__ - const auto vconst_0_f32 = vdupq_n_f32(0); -#endif // __aarch64__ - const float32x4_t va_f32 = vdupq_n_f32(act_info.a()); - const float32x4_t vb_f32 = vdupq_n_f32(act_info.b()); - const float a_f32 = act_info.a(); - const float b_f32 = act_info.b(); - const auto const_6_f32 = vdupq_n_f32(6.f); - const auto const_0_f32 = vdupq_n_f32(0.f); - const auto const_3_f32 = vdupq_n_f32(3.f); - const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f); - - // Initialise scale/offset for re-quantization - float s = qi_in.scale / qi_out.scale; - float o = -qi_in.offset * s + qi_out.offset; - float32x4_t vs = vdupq_n_f32(s); - float32x4_t vo = vdupq_n_f32(o); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - wrapper::traits::neon_bitvector_t tmp; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - // Perform activation - tmp = vmaxq_u8(vconst_0, vin); - // Re-quantize to new output space - tmp = vmlaq_qasymm8(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - // Perform activation - tmp = vminq_u8(va, vmaxq_u8(vb, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = - { - { - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, 
wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))), - } - }; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = - { - { - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))), - } - }; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = - { - { - wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), - wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), - wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), - wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), - } - }; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - const auto vin_deq = vdequantize(vin, qi_in); - -#ifdef __aarch64__ - const uint32x4x4_t pos_mask = - { - { - wrapper::vcgtz(vin_deq.val[0]), - wrapper::vcgtz(vin_deq.val[1]), - wrapper::vcgtz(vin_deq.val[2]), - wrapper::vcgtz(vin_deq.val[3]), - } - }; -#else // __aarch64__ - const uint32x4x4_t pos_mask = - { - { - wrapper::vcgt(vin_deq.val[0], vconst_0_f32), - wrapper::vcgt(vin_deq.val[1], vconst_0_f32), - wrapper::vcgt(vin_deq.val[2], vconst_0_f32), - wrapper::vcgt(vin_deq.val[3], vconst_0_f32), - } - }; -#endif // __aarch64__ - - const float32x4x4_t tmp_dep = - { - { - wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])), - wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])), - wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])), - wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])), - } - }; - - tmp = vquantize(tmp_dep, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - wrapper::vstore(output_ptr + x, tmp); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - qasymm8_t in = *(reinterpret_cast(input_ptr + x)); - qasymm8_t tmp = 0; - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - tmp = std::max(const_0, in); - tmp = utility::clamp(tmp * s + o); - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - tmp = std::min(a, std::max(const_0, in)); - tmp = utility::clamp(tmp * s + o); - } - else if(act == 
ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - tmp = std::min(a, std::max(b, in)); - tmp = utility::clamp(tmp * s + o); - } - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = 1.f / (1.f + std::exp(-tmp_f)); - tmp = quantize_qasymm8(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); - tmp = quantize_qasymm8(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); - tmp = quantize_qasymm8(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32; - tmp = quantize_qasymm8(tmp_f, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - *(output_ptr + x) = tmp; - } - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/activation/neon/qasymm8_signed.cpp b/src/core/cpu/kernels/activation/neon/qasymm8_signed.cpp deleted file mode 100644 index 8b40bf8e72..0000000000 --- a/src/core/cpu/kernels/activation/neon/qasymm8_signed.cpp +++ /dev/null @@ -1,261 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -void qasymm8_signed_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - constexpr int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); - const qasymm8x16_signed_t va = vdupq_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in)); - const qasymm8x16_signed_t vb = vdupq_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in)); - const qasymm8_signed_t a = quantize_qasymm8_signed(act_info.a(), qi_in); - const qasymm8_signed_t b = quantize_qasymm8_signed(act_info.b(), qi_in); - const qasymm8_signed_t const_0 = quantize_qasymm8_signed(0.f, qi_in); - const qasymm8x16_signed_t vconst_0 = vdupq_n_s8(const_0); - const auto vconst_1 = vdupq_n_f32(1.f); -#ifndef __aarch64__ - const auto vconst_0_f32 = vdupq_n_f32(1.f); -#endif // __aarch64__ - const float32x4_t va_f32 = vdupq_n_f32(act_info.a()); - const float32x4_t vb_f32 = vdupq_n_f32(act_info.b()); - const float a_f32 = act_info.a(); - const float b_f32 = act_info.b(); - const auto const_6_f32 = vdupq_n_f32(6.f); - const auto const_0_f32 = vdupq_n_f32(0.f); - const auto const_3_f32 = vdupq_n_f32(3.f); - const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f); - - // Initialise scale/offset for re-quantization - float s = qi_in.scale / qi_out.scale; - float o = -qi_in.offset * s + qi_out.offset; - float32x4_t vs = vdupq_n_f32(s); - float32x4_t vo = vdupq_n_f32(o); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - wrapper::traits::neon_bitvector_t tmp; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - // Perform activation - tmp = vmaxq_s8(vconst_0, vin); - // Re-quantize to new output space - tmp = vmlaq_qasymm8_signed(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8_signed(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - // Perform activation - tmp = vminq_s8(va, vmaxq_s8(vb, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8_signed(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = - { - { - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, 
wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))), - } - }; - // Re-quantize to new output space - tmp = vquantize_signed(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = - { - { - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))), - } - }; - // Re-quantize to new output space - tmp = vquantize_signed(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = - { - { - wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), - wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), - wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), - wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), - } - }; - // Re-quantize to new output space - tmp = vquantize_signed(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - const auto vin_deq = vdequantize(vin, qi_in); - -#ifdef __aarch64__ - const uint32x4x4_t pos_mask = - { - { - wrapper::vcgtz(vin_deq.val[0]), - wrapper::vcgtz(vin_deq.val[1]), - wrapper::vcgtz(vin_deq.val[2]), - wrapper::vcgtz(vin_deq.val[3]), - } - }; -#else // __aarch64__ - const uint32x4x4_t pos_mask = - { - { - wrapper::vcgt(vin_deq.val[0], vconst_0_f32), - wrapper::vcgt(vin_deq.val[1], vconst_0_f32), - wrapper::vcgt(vin_deq.val[2], vconst_0_f32), - wrapper::vcgt(vin_deq.val[3], vconst_0_f32), - } - }; -#endif // __aarch64__ - - const float32x4x4_t tmp_dep = - { - { - wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])), - wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])), - wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])), - wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])), - } - }; - - tmp = vquantize_signed(tmp_dep, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - wrapper::vstore(output_ptr + x, tmp); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - qasymm8_signed_t in = *(reinterpret_cast(input_ptr + x)); - qasymm8_signed_t tmp = 0; - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - tmp = std::max(const_0, in); - tmp = utility::clamp(tmp * s + o); - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { 
- tmp = std::min(a, std::max(const_0, in)); - tmp = utility::clamp(tmp * s + o); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - tmp = std::min(a, std::max(b, in)); - tmp = utility::clamp(tmp * s + o); - } - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = 1.f / (1.f + std::exp(-tmp_f)); - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32; - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - *(output_ptr + x) = tmp; - } - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/activation/neon/qsymm16.cpp b/src/core/cpu/kernels/activation/neon/qsymm16.cpp deleted file mode 100644 index 54b41820f2..0000000000 --- a/src/core/cpu/kernels/activation/neon/qsymm16.cpp +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/experimental/Types.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/NESymm.h" -#include "src/core/NEON/wrapper/wrapper.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -void qsymm16_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - constexpr int window_step_x = 8; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); - const auto vconst_1 = vdupq_n_f32(1.f); - const float32x4_t va_f32 = vdupq_n_f32(act_info.a()); - const float32x4_t vb_f32 = vdupq_n_f32(act_info.b()); - const float a_f32 = act_info.a(); - const float b_f32 = act_info.b(); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - wrapper::traits::neon_bitvector_t tmp; - ARM_COMPUTE_UNUSED(tmp); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = vdequantize_int16(vin, qi_in.scale); - // Perform activation - const float32x4x2_t tmp_dep = - { - { - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), - } - }; - // Re-quantize to new output space - tmp = vquantize_int16(tmp_dep, qi_out.scale); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = vdequantize_int16(vin, qi_in.scale); - // Perform activation - const float32x4x2_t tmp_dep = - { - { - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), - } - }; - // Re-quantize to new output space - tmp = vquantize_int16(tmp_dep, qi_out.scale); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - wrapper::vstore(output_ptr + x, tmp); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - qsymm16_t in = *(reinterpret_cast(input_ptr + x)); - qsymm16_t tmp = 0; - if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - float tmp_f = dequantize_qsymm16(in, qi_in.scale); - tmp_f = 1.f / (1.f + std::exp(-tmp_f)); - tmp = quantize_qsymm16(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - float tmp_f = dequantize_qsymm16(in, qi_in.scale); - tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); - tmp = quantize_qsymm16(tmp_f, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - *(output_ptr + x) = tmp; - } - }, - input, output); 
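[The quantized NEON kernels above (qasymm8, qasymm8_signed, qsymm16) share one split: ReLU-family functions stay in the integer domain and fold the input/output quantization into a single scale s = scale_in / scale_out and offset o = offset_out - offset_in * s, while LOGISTIC, TANH, HARD_SWISH and LEAKY_RELU dequantize to float, apply the function and requantize. Below is a scalar model of the folded ReLU path; the helper is hypothetical and assumes uniform asymmetric quantization q = round(x / scale) + offset.]

#include <algorithm>
#include <cmath>
#include <cstdint>

uint8_t relu_requantized(uint8_t q_in, float scale_in, int offset_in, float scale_out, int offset_out)
{
    const uint8_t q_zero = static_cast<uint8_t>(offset_in);      // quantized representation of 0.f
    const float   s      = scale_in / scale_out;
    const float   o      = -offset_in * s + offset_out;
    const float   q_out  = std::max(q_in, q_zero) * s + o;       // max() is ReLU in the quantized domain
    return static_cast<uint8_t>(std::min<long>(std::max<long>(std::lround(q_out), 0), 255));
}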
-} -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/activation/sve/fp16.cpp b/src/core/cpu/kernels/activation/sve/fp16.cpp deleted file mode 100644 index 5e76e82c52..0000000000 --- a/src/core/cpu/kernels/activation/sve/fp16.cpp +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(__ARM_FEATURE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" - -#include -#include - -#include "src/core/NEON/SVEMath.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -void fp16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const auto const_1 = svdup_n_f16(1.f); - const auto const_0 = svdup_n_f16(0.f); - const auto const_6 = svdup_n_f16(6.f); - const auto const_3 = svdup_n_f16(3.f); - const auto const_inv_6 = svdup_n_f16(0.166666667f); - - const auto va = svdup_n_f16(act_info.a()); - const auto vb = svdup_n_f16(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - svfloat16_t tmp; - - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do - { - const auto vin = svld1_f16(pg, input_ptr + x); - switch(act) - { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = svabs_f16_z(pg, vin); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = svmla_f16_z(pg, vb, va, vin); - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin)))); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = svmax_f16_z(pg, const_0, vin); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin)); - break; - case 
ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = svadd_f16_z(pg, svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va), svmax_f16_z(pg, vin, const_0)); - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin))); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = svsel_f16(svcmpgt_f16(pg, vin, const_0), vin, svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1))); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: - tmp = svsqrt_f16_z(pg, vin); - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = svmul_f16_z(pg, vin, vin); - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin))); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = vin; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = svmul_f16_z(pg, vin, svmul_f16_z(pg, const_inv_6, svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, const_3))))); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - svst1_f16(pg, output_ptr + x, tmp); - - x += svcnth(); - pg = svwhilelt_b16(x, window_end_x); - - } - while(svptest_any(svptrue_b16(), pg)); - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE) */ \ No newline at end of file diff --git a/src/core/cpu/kernels/activation/sve/fp32.cpp b/src/core/cpu/kernels/activation/sve/fp32.cpp deleted file mode 100644 index cb9f82eb39..0000000000 --- a/src/core/cpu/kernels/activation/sve/fp32.cpp +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#if defined(__ARM_FEATURE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/SVEMath.h" - -#include -#include - -#include - -namespace arm_compute -{ -namespace cpu -{ -void fp32_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const auto const_1 = svdup_n_f32(1.f); - const auto const_0 = svdup_n_f32(0.f); - const auto const_6 = svdup_n_f32(6.f); - const auto const_3 = svdup_n_f32(3.f); - const auto const_inv_6 = svdup_n_f32(0.166666667f); - - const auto va = svdup_n_f32(act_info.a()); - const auto vb = svdup_n_f32(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - svfloat32_t tmp; - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b32(x, window_end_x); - do - { - const auto vin = svld1_f32(pg, input_ptr + x); - switch(act) - { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = svabs_f32_z(pg, vin); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = svmla_f32_z(pg, vb, va, vin); - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, vin)))); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = svmax_f32_z(pg, const_0, vin); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = svadd_f32_z(pg, svmul_f32_z(pg, svmin_f32_z(pg, vin, const_0), va), svmax_f32_z(pg, vin, const_0)); - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = svlog_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, vin))); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = svsel_f32(svcmpgt_f32(pg, vin, const_0), vin, svmul_f32_z(pg, va, svsub_f32_z(pg, svexp_f32_z(pg, vin), const_1))); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: - tmp = svsqrt_f32_z(pg, vin); - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = svmul_f32_z(pg, vin, vin); - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = svmul_f32_z(pg, va, svtanh_f32_z(pg, svmul_f32_z(pg, vb, vin))); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = vin; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = svmul_f32_z(pg, vin, svmul_f32_z(pg, const_inv_6, svmin_f32_z(pg, const_6, svmax_f32_z(pg, const_0, svadd_f32_z(pg, vin, const_3))))); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - svst1_f32(pg, output_ptr + x, tmp); - - x += svcntw(); - pg = svwhilelt_b32(x, window_end_x); - - } - 
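[The SVE kernels in this hunk all use one loop shape that differs from the NEON files above: svwhilelt builds a governing predicate for the elements still to process, so partial vectors at the end of a row are handled by predication and no scalar leftover loop is needed. A minimal standalone sketch of that shape, illustrative only and assuming SVE is available at compile time:]

#include <arm_sve.h>

// Predicated SVE loop: pg masks off lanes past n, svcntw() advances by the
// hardware vector length in 32-bit lanes, and the loop ends when no lane is active.
void scale_f32_sve(const float *src, float *dst, int n, float factor)
{
    int      x  = 0;
    svbool_t pg = svwhilelt_b32(x, n);
    do
    {
        const svfloat32_t vin = svld1_f32(pg, src + x);
        svst1_f32(pg, dst + x, svmul_n_f32_z(pg, vin, factor));
        x += svcntw();
        pg = svwhilelt_b32(x, n);
    } while(svptest_any(svptrue_b32(), pg));
}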
while(svptest_any(svptrue_b32(), pg)); - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE) */ \ No newline at end of file diff --git a/src/core/cpu/kernels/activation/sve/qasymm8.cpp b/src/core/cpu/kernels/activation/sve/qasymm8.cpp deleted file mode 100644 index 69fffd96c5..0000000000 --- a/src/core/cpu/kernels/activation/sve/qasymm8.cpp +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(ARM_COMPUTE_ENABLE_SVE2) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Window.h" - -#include -#include - -#include "src/core/NEON/SVEAsymm.h" -#include "src/core/NEON/SVEMath.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -void qasymm8_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); - const auto va = svdup_n_u8(quantize_qasymm8(act_info.a(), qi_in)); - const auto vb = svdup_n_u8(quantize_qasymm8(act_info.b(), qi_in)); - const auto const_0 = quantize_qasymm8(0.f, qi_in); - const auto vconst_0 = svdup_n_u8(const_0); - const auto vconst_1 = svdup_n_f32(1.f); - const auto va_f32 = svdup_n_f32(act_info.a()); - const auto vb_f32 = svdup_n_f32(act_info.b()); - const auto const_6_f32 = svdup_n_f32(6.f); - const auto const_0_f32 = svdup_n_f32(0.f); - const auto const_3_f32 = svdup_n_f32(3.f); - const auto const_inv_6_f32 = svdup_n_f32(0.166666667f); - - // Initialise scale/offset for re-quantization - bool requant = true; - if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) - { - requant = false; - } - float s = qi_in.scale / qi_out.scale; - float o = -qi_in.offset * s + qi_out.offset; - auto vs = svdup_n_f32(s); - auto vo = svdup_n_f32(o); - - // Initialise scale/offset for re-quantization with int32_t - const auto 
voffset_in = svdup_n_s32(qi_in.offset); - int32_t s_s32 = round(s * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - int32_t o_s32 = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - const auto vs_s32 = svdup_n_s32(s_s32); - const auto vo_s32 = svdup_n_s32(o_s32); - - // Initialise scale/offset for re-quantization for leaky relu - int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8), - arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32); - const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - svuint8_t tmp; - - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do - { - const auto vin = svld1_u8(pg, input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - // Perform activation - tmp = svmax_u8_z(pg, vconst_0, vin); - // Re-quantize to new output space - tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin)); - // Re-quantize to new output space - tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - // Perform activation - tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin)); - // Re-quantize to new output space - tmp = svmla_qasymm8_z(pg, tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = - { - { { - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))), - } - } - }; - // Re-quantize to new output space - tmp = svquantize_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = - { - { { - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))), - } - } - }; - // Re-quantize to new output space - tmp = svquantize_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = - { - { { - svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, 
const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 0), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32))))), - } - } - }; - // Re-quantize to new output space - tmp = svquantize_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - svbool_t p0, p1, p2, p3; - svint32x4_t tmp_dep; - - // Expand to int32 - const svint32x4_t vin_s32 = - { - { { - svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))), - svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))), - svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))), - svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin))), - } - } - }; - - // Compare elements to input offset - if(qi_in.scale >= 0) - { - p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); - p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); - p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); - p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); - } - else - { - p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); - p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); - p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); - p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); - } - - // Multiply negative elements and requantize if necessary - if(requant) - { - tmp_dep = svcreate4_s32( - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8)); - } - else - { - tmp_dep = svcreate4_s32( - svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)); - } - - // Convert uint32 vectors to uint16 vectors (with saturation) - const auto v_low_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1)); - const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3)); - - // convert uint16 vectors to uint8 vectors (with saturation) - tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - - svst1_u8(pg, output_ptr + x, tmp); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - - } - while(svptest_any(svptrue_b8(), pg)); - - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ \ No newline at end of file diff --git 
a/src/core/cpu/kernels/activation/sve/qasymm8_signed.cpp b/src/core/cpu/kernels/activation/sve/qasymm8_signed.cpp deleted file mode 100644 index 53ee515ff9..0000000000 --- a/src/core/cpu/kernels/activation/sve/qasymm8_signed.cpp +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/wrapper/wrapper.h" - -#include -#include - -#if defined(ARM_COMPUTE_ENABLE_SVE2) -#include "src/core/NEON/SVEAsymm.h" -#include "src/core/NEON/SVEMath.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -void qasymm8_signed_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); - const auto va = svdup_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in)); - const auto vb = svdup_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in)); - const auto const_0 = quantize_qasymm8_signed(0.f, qi_in); - const auto vconst_0 = svdup_n_s8(const_0); - const auto vconst_1 = svdup_n_f32(1.f); - const auto va_f32 = svdup_n_f32(act_info.a()); - const auto vb_f32 = svdup_n_f32(act_info.b()); - const auto const_6_f32 = svdup_n_f32(6.f); - const auto const_0_f32 = svdup_n_f32(0.f); - const auto const_3_f32 = svdup_n_f32(3.f); - const auto const_inv_6_f32 = svdup_n_f32(0.166666667f); - - // Initialise scale/offset for re-quantization - bool requant = true; - if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) - { - requant = false; - } - float s = qi_in.scale / qi_out.scale; - float o = -qi_in.offset * s + qi_out.offset; - auto vs = svdup_n_f32(s); - auto vo = svdup_n_f32(o); - - // Initialise scale/offset for re-quantization with int32_t - const auto voffset_in = svdup_n_s32(qi_in.offset); - int32_t s_s32 = round(s * (1 << 8), 
arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - int32_t o_s32 = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - const auto vs_s32 = svdup_n_s32(s_s32); - const auto vo_s32 = svdup_n_s32(o_s32); - - // Initialise scale/offset for re-quantization for leaky relu - int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8), - arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32); - const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - svint8_t tmp; - - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do - { - const auto vin = svld1_s8(pg, input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - // Perform activation - tmp = svmax_s8_z(pg, vconst_0, vin); - // Re-quantize to new output space - tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin)); - // Re-quantize to new output space - tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - // Perform activation - tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin)); - // Re-quantize to new output space - tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = - { - { { - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))), - } - } - }; - // Re-quantize to new output space - tmp = svquantize_signed_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = - { - { { - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))), - } - } - }; - // Re-quantize to new output space - tmp = svquantize_signed_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = - { - { { - svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, 
svget4_f32(vin_deq, 0), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32))))), - } - } - }; - // Re-quantize to new output space - tmp = svquantize_signed_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - svbool_t p0, p1, p2, p3; - svint32x4_t tmp_dep; - - // Expand to int32 - const svint32x4_t vin_s32 = - { - { { - svmovlb_s32(svmovlb_s16(vin)), - svmovlt_s32(svmovlb_s16(vin)), - svmovlb_s32(svmovlt_s16(vin)), - svmovlt_s32(svmovlt_s16(vin)), - } - } - }; - - // Compare elements to input offset - if(qi_in.scale >= 0) - { - p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); - p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); - p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); - p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); - } - else - { - p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); - p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); - p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); - p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); - } - - // Multiply negative elements and requantize if necessary - if(requant) - { - tmp_dep = svcreate4_s32( - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8)); - } - else - { - tmp_dep = svcreate4_s32( - svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)); - } - - // Convert uint32 vectors to uint16 vectors (with saturation) - const auto v_low_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1)); - const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3)); - - // convert uint16 vectors to uint8 vectors (with saturation) - tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - - svst1_s8(pg, output_ptr + x, tmp); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - - } - while(svptest_any(svptrue_b8(), pg)); - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ diff --git a/src/core/cpu/kernels/activation/sve/qsymm16.cpp b/src/core/cpu/kernels/activation/sve/qsymm16.cpp deleted file mode 100644 index 
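[Editor's note] The qasymm8_signed SVE activation kernel deleted above folds the input and output quantization parameters into a single scale/offset pair (s, o), and for the integer LEAKY_RELU path it converts that pair into 8-bit fixed point so the whole re-quantization becomes a multiply-add followed by a right shift by 8. A minimal scalar sketch of that folding is shown below; UQInfo and requantize_s8 are illustrative names only, not library API, and rounding details are simplified.

#include <algorithm>
#include <cmath>
#include <cstdint>

struct UQInfo { float scale; int32_t offset; }; // assumed stand-in for UniformQuantizationInfo

// Map a quantized value from the input space to the output space:
// de-quantize with (scale_in, offset_in), re-quantize with (scale_out, offset_out),
// folded into one multiply-add. The Q.8 fixed-point form mirrors the
// svmla + shift-right-by-8 used by the vector code.
int8_t requantize_s8(int8_t q_in, UQInfo in, UQInfo out)
{
    const float   s       = in.scale / out.scale;
    const float   o       = -in.offset * s + out.offset;
    const int32_t s_fixed = static_cast<int32_t>(std::lround(s * (1 << 8)));
    const int32_t o_fixed = static_cast<int32_t>(std::lround(o * (1 << 8)));
    const int32_t r       = (q_in * s_fixed + o_fixed) >> 8;
    return static_cast<int8_t>(std::clamp(r, -128, 127));
}

For LEAKY_RELU the kernel simply prepares a second fixed-point pair with the scale multiplied by act_info.a() and selects between the two per lane, depending on whether the element is below the input zero-point.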
ac549770a2..0000000000 --- a/src/core/cpu/kernels/activation/sve/qsymm16.cpp +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/experimental/Types.h" - -#include -#include - -#if defined(ARM_COMPUTE_ENABLE_SVE2) -#include "src/core/NEON/SVEMath.h" -#include "src/core/NEON/SVESymm.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -void qsymm16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); - const auto vconst_1 = svdup_n_f32(1.f); - const auto va_f32 = svdup_n_f32(act_info.a()); - const auto vb_f32 = svdup_n_f32(act_info.b()); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - svint16_t tmp; - - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do - { - const auto vin = svld1_s16(pg, input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); - // Perform activation - const svfloat32x2_t tmp_dep = - { - { { - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 0))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 1))))), - } - } - }; - // Re-quantize to new output space - tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); - // Perform 
activation - const svfloat32x2_t tmp_dep = - { - { { - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 0), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 1), vb_f32))), - } - } - }; - // Re-quantize to new output space - tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - - svst1_s16(pg, output_ptr + x, tmp); - - x += svcnth(); - pg = svwhilelt_b16(x, window_end_x); - - } - while(svptest_any(svptrue_b16(), pg)); - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ diff --git a/src/core/cpu/kernels/add/neon/list.h b/src/core/cpu/kernels/add/neon/list.h deleted file mode 100644 index 379bd32fb1..0000000000 --- a/src/core/cpu/kernels/add/neon/list.h +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_NEON_KERNELS_ADD_LIST_H -#define SRC_CORE_NEON_KERNELS_ADD_LIST_H - -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/wrapper.h" - -namespace arm_compute -{ -namespace cpu -{ -#define DECLARE_ADD_KERNEL(func_name) \ - void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) - -DECLARE_ADD_KERNEL(add_qasymm8_neon); -DECLARE_ADD_KERNEL(add_qasymm8_signed_neon); -DECLARE_ADD_KERNEL(add_qsymm16_neon); - -#undef DECLARE_ADD_KERNEL - -template -void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - /** SIMD vector tag type. 
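[Editor's note] The QSYMM16 activation kernel deleted above follows a de-quantize, compute in float, re-quantize pattern; since the format is symmetric there is no zero-point, only a scale on each side. A scalar sketch of the LOGISTIC path, with an illustrative helper name and assuming round-to-nearest re-quantization:

#include <algorithm>
#include <cmath>
#include <cstdint>

int16_t qsymm16_logistic(int16_t q_in, float scale_in, float scale_out)
{
    const float x = q_in * scale_in;            // de-quantize
    const float y = 1.f / (1.f + std::exp(-x)); // activation in float
    const long  q = std::lround(y / scale_out); // re-quantize
    return static_cast<int16_t>(std::clamp<long>(q, INT16_MIN, INT16_MAX));
}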
*/ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - constexpr int window_step_x = 16 / sizeof(ScalarType); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const ScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); - const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v; - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto val1 = wrapper::vloadq(input1_ptr + x); - const auto val2 = wrapper::vloadq(input2_ptr + x); - const auto res = (policy == ConvertPolicy::SATURATE) ? 
wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto val1 = *(input1_ptr + x); - const auto val2 = *(input2_ptr + x); - *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2; - } - }, - input1, input2, output); - } -} -} // namespace cpu -} // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_ADD_LIST_H diff --git a/src/core/cpu/kernels/add/neon/qasymm8.cpp b/src/core/cpu/kernels/add/neon/qasymm8.cpp deleted file mode 100644 index e357a7ef7f..0000000000 --- a/src/core/cpu/kernels/add/neon/qasymm8.cpp +++ /dev/null @@ -1,209 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -void add_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - ARM_COMPUTE_UNUSED(policy); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - - const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); - const float32x4_t voffseto = vdupq_n_f32(oq_info.offset); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? 
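[Editor's note] The templated add_same_neon in the deleted add/neon/list.h handles two cases: when one input has a size-1 X dimension it is broadcast across the row, and the ConvertPolicy selects between saturating and wrapping addition; full vectors are processed first and a scalar loop handles the left-over elements. The scalar reference below captures the same per-row behaviour for integral element types; it is a hypothetical helper, not part of the library.

#include <cstddef>
#include <cstdint>
#include <limits>

template <typename T>
T add_with_policy(T a, T b, bool saturate)
{
    if(!saturate)
    {
        return static_cast<T>(a + b); // WRAP
    }
    const int64_t r  = static_cast<int64_t>(a) + static_cast<int64_t>(b);
    const int64_t lo = std::numeric_limits<T>::min();
    const int64_t hi = std::numeric_limits<T>::max();
    return static_cast<T>(r < lo ? lo : (r > hi ? hi : r)); // SATURATE
}

template <typename T>
void add_row(const T *in1, const T *in2, T *out, size_t n, bool broadcast_in2, bool saturate)
{
    for(size_t x = 0; x < n; ++x)
    {
        const T b = broadcast_in2 ? in2[0] : in2[x]; // size-1 X dimension is broadcast
        out[x]    = add_with_policy(in1[x], b, saturate);
    }
}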
input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - - const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale); - const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale); - const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset); - const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const uint8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const uint8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value); - - const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2); - const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2); - const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2); - const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2); - - const float bfs = static_cast(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x); - const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1); - const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1); - const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1); - const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1); - - int32x4_t rf_0{}; - int32x4_t rf_1{}; - int32x4_t rf_2{}; - int32x4_t rf_3{}; - -#ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo)); - rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo)); - rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, 
vaddq_f32(af_3, bf_3), invvscaleo)); -#else //__aarch64__ - rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo)); - rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo)); - rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo)); -#endif //__aarch64__ - - const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); - const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); - vst1q_u8(output_ptr + x, vcombine_u8(pa, pb)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale; - *(output_ptr + x) = quantize_qasymm8((afs + bfs), oq_info); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); - const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); - const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset); - const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t a = vld1q_u8(input1_ptr + x); - const uint8x16_t b = vld1q_u8(input2_ptr + x); - - const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1); - const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1); - const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1); - const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1); - - const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2); - const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2); - const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2); - const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2); - - int32x4_t rf_0{}; - int32x4_t rf_1{}; - int32x4_t rf_2{}; - int32x4_t rf_3{}; - -#ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo)); - rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo)); - rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, 
vaddq_f32(af_3, bf_3), invvscaleo)); -#else //__aarch64__ - rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo)); - rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo)); - rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo)); -#endif //__aarch64__ - - const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); - const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); - vst1q_u8(output_ptr + x, vcombine_u8(pa, pb)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale; - const float bfs = static_cast((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale; - *(output_ptr + x) = quantize_qasymm8((afs + bfs), oq_info); - } - }, - input1, input2, output); - } -} -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/add/neon/qasymm8_signed.cpp b/src/core/cpu/kernels/add/neon/qasymm8_signed.cpp deleted file mode 100644 index d62d0739f5..0000000000 --- a/src/core/cpu/kernels/add/neon/qasymm8_signed.cpp +++ /dev/null @@ -1,208 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
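[Editor's note] The deleted add_qasymm8_neon works entirely in the float domain: each operand is de-quantized with its own scale and offset, the sum is re-quantized with the destination parameters, and the result is narrowed with unsigned saturation. The left-over loop above does exactly this per element; a scalar sketch (illustrative names, rounding simplified):

#include <algorithm>
#include <cmath>
#include <cstdint>

struct UQInfo { float scale; int32_t offset; }; // assumed stand-in for UniformQuantizationInfo

uint8_t add_qasymm8_scalar(uint8_t a, uint8_t b, UQInfo qa, UQInfo qb, UQInfo qo)
{
    const float af = (static_cast<int32_t>(a) - qa.offset) * qa.scale; // de-quantize a
    const float bf = (static_cast<int32_t>(b) - qb.offset) * qb.scale; // de-quantize b
    const long  q  = std::lround((af + bf) / qo.scale) + qo.offset;    // re-quantize the sum
    return static_cast<uint8_t>(std::clamp<long>(q, 0, 255));
}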
- */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -void add_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - ARM_COMPUTE_UNUSED(policy); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - - const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); - const float32x4_t voffseto = vdupq_n_f32(oq_info.offset); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - - const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale); - const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale); - const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset); - const int32x4_t voffset2 = is_broadcast_input_2 ? 
vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const int8x16_t broadcast_value_vec = vdupq_n_s8(broadcast_value); - - const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(broadcast_value_vec)))), voffset2)), vscale2); - const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(broadcast_value_vec)))), voffset2)), vscale2); - const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(broadcast_value_vec)))), voffset2)), vscale2); - const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(broadcast_value_vec)))), voffset2)), vscale2); - const float bfs = static_cast(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int8x16_t a = vld1q_s8(non_broadcast_input_ptr + x); - - const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1); - const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1); - const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1); - const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1); - - int32x4_t rf_0{}; - int32x4_t rf_1{}; - int32x4_t rf_2{}; - int32x4_t rf_3{}; - -#ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo)); - rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo)); - rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo)); -#else //__aarch64__ - rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo)); - rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo)); - rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo)); -#endif //__aarch64__ - - const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); - const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); - vst1q_s8(output_ptr + x, vcombine_s8(pa, pb)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale; - *(output_ptr + x) = quantize_qasymm8_signed((afs + bfs), oq_info); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, 
Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); - const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); - const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset); - const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int8x16_t a = vld1q_s8(input1_ptr + x); - const int8x16_t b = vld1q_s8(input2_ptr + x); - - const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1); - const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1); - const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1); - const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1); - - const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(b)))), voffset2)), vscale2); - const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(b)))), voffset2)), vscale2); - const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(b)))), voffset2)), vscale2); - const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(b)))), voffset2)), vscale2); - - int32x4_t rf_0{}; - int32x4_t rf_1{}; - int32x4_t rf_2{}; - int32x4_t rf_3{}; - -#ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo)); - rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo)); - rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo)); -#else //__aarch64__ - rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo)); - rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo)); - rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo)); -#endif //__aarch64__ - - const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); - const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); - vst1q_s8(output_ptr + x, vcombine_s8(pa, pb)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale; - const float bfs = static_cast((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale; - *(output_ptr + x) = quantize_qasymm8_signed((afs + bfs), dst->info()->quantization_info()); - } - }, - input1, input2, output); - } -} -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/add/neon/qsymm16.cpp b/src/core/cpu/kernels/add/neon/qsymm16.cpp deleted file mode 100644 index e76e408d6e..0000000000 --- 
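[Editor's note] The #ifdef __aarch64__ blocks in these add kernels switch between two float-to-int conversions: vcvtnq_s32_f32 (round to nearest, AArch64 only) and vcvtq_s32_f32 (truncate toward zero). The two can differ by one LSB after re-quantization, which is worth keeping in mind when comparing results across builds. A scalar illustration of the difference:

#include <cmath>
#include <cstdint>
#include <cstdio>

int main()
{
    const float   v         = 2.7f;
    const int32_t truncated = static_cast<int32_t>(v);                  // 2, like vcvtq_s32_f32
    const int32_t nearest   = static_cast<int32_t>(std::nearbyint(v));  // 3, like vcvtnq_s32_f32 (ties to even under the default rounding mode)
    std::printf("truncate=%d nearest=%d\n", truncated, nearest);
    return 0;
}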
a/src/core/cpu/kernels/add/neon/qsymm16.cpp +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - ARM_COMPUTE_UNUSED(policy); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 8; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - - const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); - const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); - const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src1 : src0; - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const int16_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value); - - const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2); - const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2); - const float bfs = static_cast(broadcast_value) * broadcast_qinfo.scale; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x); - const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1); - const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1); - - int32x4_t rf_0{}; - int32x4_t rf_1{}; -#ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); -#else //__aarch64__ - rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); -#endif //__aarch64__ - - const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)); - vst1q_s16(output_ptr + x, pa); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale; - *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8_t a = vld1q_s16(input1_ptr + x); - const int16x8_t b = vld1q_s16(input2_ptr + x); - - const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1); - const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1); - const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2); - const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2); - - int32x4_t rf_0{}; - int32x4_t rf_1{}; -#ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = 
vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); -#else //__aarch64__ - rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); -#endif //__aarch64__ - - const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)); - vst1q_s16(output_ptr + x, pa); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast((*(input1_ptr + x))) * iq1_info.scale; - const float bfs = static_cast((*(input2_ptr + x))) * iq2_info.scale; - *(output_ptr + x) = quantize_qsymm16((afs + bfs), dst->info()->quantization_info()); - } - }, - input1, input2, output); - } -} -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/add/sve/impl.cpp b/src/core/cpu/kernels/add/sve/impl.cpp deleted file mode 100644 index cf9e301c29..0000000000 --- a/src/core/cpu/kernels/add/sve/impl.cpp +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#if defined(__ARM_FEATURE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" - -#include "src/core/NEON/SVEMath.h" -#include "src/core/cpu/kernels/add/sve/impl.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -template -void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - const auto all_true_pg = wrapper::svptrue(); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - const bool is_sat = (policy == ConvertPolicy::SATURATE); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - Iterator input1(src0, window.broadcast_if_dimension_le_one(src0->info()->tensor_shape())); - Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape())); - Iterator output(dst, window); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const ScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::svdup_n(broadcast_value); - - int x = window_start_x; - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x); - auto res = is_sat ? 
wrapper::svqadd(broadcast_value_vec, non_broadcast_v) : svadd_z(pg, broadcast_value_vec, non_broadcast_v); - svst1(pg, output_ptr + x, res); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto val1 = svld1(pg, input1_ptr + x); - const auto val2 = svld1(pg, input2_ptr + x); - const auto res = is_sat ? wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2); - svst1(pg, output_ptr + x, res); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); - } -} - -template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -} // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE) */ \ No newline at end of file diff --git a/src/core/cpu/kernels/add/sve/impl.h b/src/core/cpu/kernels/add/sve/impl.h deleted file mode 100644 index 32ff5d0496..0000000000 --- a/src/core/cpu/kernels/add/sve/impl.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
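[Editor's note] The deleted add_same_sve shows the predicated loop shape used throughout the SVE kernels: svwhilelt builds a per-lane predicate from the loop counter, so partially filled final vectors are handled by the predicate and no scalar tail loop is needed. A minimal float sketch of that loop structure, compiled only where SVE is available; the function name is illustrative.

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#include <cstdint>

void add_f32_sve(const float *a, const float *b, float *dst, int64_t n)
{
    int64_t  x  = 0;
    svbool_t pg = svwhilelt_b32(x, n); // lanes where x + lane < n are active
    do
    {
        const svfloat32_t va = svld1_f32(pg, a + x);
        const svfloat32_t vb = svld1_f32(pg, b + x);
        svst1_f32(pg, dst + x, svadd_f32_z(pg, va, vb));
        x += svcntw();                 // number of 32-bit lanes in one vector
        pg = svwhilelt_b32(x, n);
    } while(svptest_any(svptrue_b32(), pg));
}
#endif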
- */ -#ifndef SRC_CORE_SVE_KERNELS_ADD_IMPL_H -#define SRC_CORE_SVE_KERNELS_ADD_IMPL_H - -#if defined(ARM_COMPUTE_ENABLE_SVE) -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" - -namespace arm_compute -{ -namespace cpu -{ -template -void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -} // namespace cpu -} // namespace arm_compute -#endif // defined(ARM_COMPUTE_ENABLE_SVE) -#endif // SRC_CORE_SVE_KERNELS_ADD_IMPL_H \ No newline at end of file diff --git a/src/core/cpu/kernels/add/sve/list.h b/src/core/cpu/kernels/add/sve/list.h deleted file mode 100644 index 4d29c2a8f1..0000000000 --- a/src/core/cpu/kernels/add/sve/list.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_SVE_KERNELS_ADD_LIST_H -#define SRC_CORE_SVE_KERNELS_ADD_LIST_H - -#if defined(ARM_COMPUTE_ENABLE_SVE) -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/SVEMath.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "src/core/cpu/kernels/add/sve/impl.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -#define DECLARE_ADD_KERNEL(func_name) \ - void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) - -DECLARE_ADD_KERNEL(add_qasymm8_sve); -DECLARE_ADD_KERNEL(add_qasymm8_signed_sve); -DECLARE_ADD_KERNEL(add_qsymm16_sve); - -#undef DECLARE_ADD_KERNEL - -} // namespace cpu -} // namespace arm_compute -#endif // defined(ARM_COMPUTE_ENABLE_SVE) -#endif // SRC_CORE_SVE_KERNELS_ADD_LIST_H \ No newline at end of file diff --git a/src/core/cpu/kernels/add/sve/qasymm8.cpp b/src/core/cpu/kernels/add/sve/qasymm8.cpp deleted file mode 100644 index 888ad878ca..0000000000 --- a/src/core/cpu/kernels/add/sve/qasymm8.cpp +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. 
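[Editor's note] The impl.h / impl.cpp pair deleted above uses the usual explicit-instantiation pattern: the header only declares the template, and the .cpp provides the definition plus one explicit instantiation per supported element type (the "template void add_same_sve..." lines), so the SVE code is compiled once and other translation units just link against it. A generic sketch of the pattern with illustrative file names and a reduced type set:

// sketch_impl.h
#include <cstdint>
template <typename T>
void add_elements(const T *a, const T *b, T *dst, int n);

// sketch_impl.cpp
template <typename T>
void add_elements(const T *a, const T *b, T *dst, int n)
{
    for(int i = 0; i < n; ++i)
    {
        dst[i] = a[i] + b[i]; // definition lives in the .cpp only
    }
}

// Explicit instantiations: these are the only element types callers can link against.
template void add_elements<float>(const float *, const float *, float *, int);
template void add_elements<int16_t>(const int16_t *, const int16_t *, int16_t *, int);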
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(ARM_COMPUTE_ENABLE_SVE2) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/SVEMath.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -void add_qasymm8_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - ARM_COMPUTE_UNUSED(policy); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - const auto all_true_pg = svptrue_b8(); - - const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale); - const auto voffseto = svdup_n_f32(oq_info.offset); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; - - const svfloat32_t vscale1 = is_broadcast_input_2 ? svdup_n_f32(iq1_info.scale) : svdup_n_f32(iq2_info.scale); - const svfloat32_t vscale2 = is_broadcast_input_2 ? svdup_n_f32(iq2_info.scale) : svdup_n_f32(iq1_info.scale); - const svint32_t voffset1 = is_broadcast_input_2 ? svdup_n_s32(iq1_info.offset) : svdup_n_s32(iq2_info.offset); - const svint32_t voffset2 = is_broadcast_input_2 ? 
svdup_n_s32(iq2_info.offset) : svdup_n_s32(iq1_info.offset); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const uint8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const svuint8_t broadcast_value_vec = svdup_n_u8(broadcast_value); - - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(broadcast_value_vec))), voffset2)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(broadcast_value_vec))), voffset2)), vscale2); - const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(broadcast_value_vec))), voffset2)), vscale2); - const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(broadcast_value_vec))), voffset2)), vscale2); - - do - { - const svuint8_t a = svld1_u8(pg, non_broadcast_input_ptr + x); - - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), vscale1); - const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), vscale1); - const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), vscale1); - - const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); - const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); - - const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1); - const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3); - - const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb); - svst1_u8(pg, output_ptr + x, res); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - const auto vscale1 = svdup_n_f32(iq1_info.scale); - const auto vscale2 = svdup_n_f32(iq2_info.scale); - const auto voffset1 = svdup_n_s32(iq1_info.offset); - const auto voffset2 = svdup_n_s32(iq2_info.offset); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto 
input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do - { - const auto a = svld1_u8(pg, input1_ptr + x); - const auto b = svld1_u8(pg, input2_ptr + x); - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), vscale1); - const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), vscale1); - const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), vscale1); - - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(b))), voffset2)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(b))), voffset2)), vscale2); - const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(b))), voffset2)), vscale2); - const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(b))), voffset2)), vscale2); - - const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); - const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); - - const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1); - const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3); - const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb); - - svst1_u8(pg, output_ptr + x, res); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); - } -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ \ No newline at end of file diff --git a/src/core/cpu/kernels/add/sve/qasymm8_signed.cpp b/src/core/cpu/kernels/add/sve/qasymm8_signed.cpp deleted file mode 100644 index 3b922c6c21..0000000000 --- a/src/core/cpu/kernels/add/sve/qasymm8_signed.cpp +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(ARM_COMPUTE_ENABLE_SVE2) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/SVEMath.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -void add_qasymm8_signed_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - ARM_COMPUTE_UNUSED(policy); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - - const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale); - const auto voffseto = svdup_n_f32(oq_info.offset); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; - const auto all_true_pg = svptrue_b8(); - - const auto vscale1 = is_broadcast_input_2 ? svdup_n_f32(iq1_info.scale) : svdup_n_f32(iq2_info.scale); - const auto vscale2 = is_broadcast_input_2 ? svdup_n_f32(iq2_info.scale) : svdup_n_f32(iq1_info.scale); - const auto voffset1 = is_broadcast_input_2 ? svdup_n_s32(iq1_info.offset) : svdup_n_s32(iq2_info.offset); - const auto voffset2 = is_broadcast_input_2 ? 
svdup_n_s32(iq2_info.offset) : svdup_n_s32(iq1_info.offset); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = svdup_n_s8(broadcast_value); - - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2); - const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2); - const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2); - - do - { - const auto a = svld1_s8(pg, non_broadcast_input_ptr + x); - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1); - const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1); - const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1); - - const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); - const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); - - const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); - const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3); - const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb); - - svst1_s8(pg, output_ptr + x, res); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - const auto vscale1 = svdup_n_f32(iq1_info.scale); - const auto vscale2 = svdup_n_f32(iq2_info.scale); - const auto voffset1 = svdup_n_s32(iq1_info.offset); - const auto voffset2 = svdup_n_s32(iq2_info.offset); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do - { - const auto a = 
svld1_s8(pg, input1_ptr + x); - const auto b = svld1_s8(pg, input2_ptr + x); - - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1); - const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1); - const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1); - - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(b)), voffset2)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(b)), voffset2)), vscale2); - const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(b)), voffset2)), vscale2); - const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(b)), voffset2)), vscale2); - - const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); - const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); - - const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); - const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3); - const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb); - - svst1_s8(pg, output_ptr + x, res); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(svptrue_b8(), pg)); - }, - input1, input2, output); - } -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ \ No newline at end of file diff --git a/src/core/cpu/kernels/add/sve/qsymm16.cpp b/src/core/cpu/kernels/add/sve/qsymm16.cpp deleted file mode 100644 index eef5d245d3..0000000000 --- a/src/core/cpu/kernels/add/sve/qsymm16.cpp +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
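/*
 * Scalar sketch (illustrative only; the helper name and the float intermediates mirror, but are
 * not part of, the sources above; requires <algorithm> and the ACL UniformQuantizationInfo type)
 * of the arithmetic that the SVE2 add loops above (add_qasymm8_sve and add_qasymm8_signed_sve)
 * vectorise: dequantize each input with its own offset/scale, add in float, then requantize
 * against the output quantization info and saturate. The unsigned variant is identical except
 * that it clamps to [0, 255].
 */
inline int8_t add_qasymm8_signed_ref(int8_t a, int8_t b,
                                     const UniformQuantizationInfo &iq1,
                                     const UniformQuantizationInfo &iq2,
                                     const UniformQuantizationInfo &oq)
{
    const float af = (a - iq1.offset) * iq1.scale;     // voffset1 / vscale1
    const float bf = (b - iq2.offset) * iq2.scale;     // voffset2 / vscale2
    const float rf = (af + bf) / oq.scale + oq.offset; // invvscaleo / voffseto
    return static_cast<int8_t>(std::max(-128.f, std::min(127.f, rf))); // svqxtnb/svqxtnt saturation
}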
- */ -#if defined(ARM_COMPUTE_ENABLE_SVE2) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/SVEMath.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -void add_qsymm16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - ARM_COMPUTE_UNUSED(policy); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - - const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - const auto vscale1 = svdup_n_f32(iq1_info.scale); - const auto vscale2 = svdup_n_f32(iq2_info.scale); - const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale); - const auto all_true_pg = svptrue_b16(); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src1 : src0; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const int16_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = svdup_n_s16(broadcast_value); - - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(broadcast_value_vec)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(broadcast_value_vec)), vscale2); - - do - { - const auto a = svld1_s16(pg, non_broadcast_input_ptr + x); - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1); - - const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - - const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); - - svst1_s16(pg, output_ptr + x, res); - - x += svcnth(); - pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do - { - auto a = svld1_s16(pg, input1_ptr + x); - auto b = svld1_s16(pg, input2_ptr + x); - - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1); - - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(b)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(b)), vscale2); - - const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - - const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); - svst1_s16(pg, output_ptr + x, res); - - x += svcnth(); - pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); - } -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ \ No newline at end of file diff --git a/src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h b/src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h deleted file mode 100644 index 4b7b092d01..0000000000 --- a/src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm 
Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H -#define ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H - -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "src/core/NEON/INEKernel.h" -#include "src/core/cpu/kernels/assembly/arm_gemm_compute_iface.hpp" - -#include "gemm_common.hpp" - -namespace arm_compute -{ -class ITensor; - -namespace cpu -{ -namespace kernel -{ -/** This class is a wrapper for the assembly kernels. - * - * Some kernels were written in assembly and highly optimised for specific CPUs like A53 or A55. - * This class works as a wrapper for these assembly kernels. The arm compute library creates an instance - * of CpuGemmAssemblyWrapperKernel and other auxiliary data structures to execute a single assembly kernel - * in the context of an NEFunctions. - * - * The type T is the type of the actual kernel implemented in assembly which is of type - * template class GemmCommon - * - * - */ -template -class CpuGemmAssemblyWrapperKernel final : public INEKernel -{ -public: - /** Constructor - */ - CpuGemmAssemblyWrapperKernel() - : _kernel(nullptr), _name("CpuGemmAssemblyWrapperKernel") - { - } - - CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &) = delete; - CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &&) = default; - CpuGemmAssemblyWrapperKernel &operator=(CpuGemmAssemblyWrapperKernel &) = delete; - - const char *name() const override - { - return _name.c_str(); - } - - void run(const Window &window, const ThreadInfo &info) override - { - ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast(_kernel))); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - - auto win = arm_gemm::to_ndcoord(window); - - arm_gemm::ndcoord_t thread_locator{}; - - _kernel->execute(win, thread_locator, info.thread_id); - } - - // Inherited methods overridden: - void run_nd(const Window &window, const ThreadInfo &info, const Window &thread_locator) override - { - ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast(_kernel))); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - - //convert between arm_compute and arm_gemm types - auto ndc_win = arm_gemm::to_ndcoord(window); - auto ndc_tlc = arm_gemm::to_ndcoord(thread_locator); - - _kernel->execute(ndc_win, ndc_tlc, info.thread_id); - } - - /** Initialise the kernel's input and output. - * - * @param[in] kernel Pointer to an assembly kernel implementation. 
- * @param[in] kernel_name_tag Tag to be attached to the kernel's name. - */ - void configure(arm_gemm::GemmCommon *kernel, std::string kernel_name_tag) - { - ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast(kernel))); - _kernel = kernel; - - Window win = to_window(kernel->get_window_size()); - - INEKernel::configure(win); - - if(!kernel_name_tag.empty()) - { - _name += "/" + kernel_name_tag; - } - } - -private: - arm_gemm::GemmCommon *_kernel; - std::string _name; -}; -} // namespace kernel -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H */ diff --git a/src/core/cpu/kernels/assembly/arm_gemm.hpp b/src/core/cpu/kernels/assembly/arm_gemm.hpp deleted file mode 100644 index e38cc09202..0000000000 --- a/src/core/cpu/kernels/assembly/arm_gemm.hpp +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
- */ -#pragma once - -#include -#include -#include - -#include "arm_gemm_local.hpp" -#include "gemm_common.hpp" - -namespace arm_gemm -{ -enum class GemmMethod -{ - DEFAULT, - GEMV_BATCHED, - GEMV_PRETRANSPOSED, - GEMV_NATIVE_TRANSPOSED, - GEMM_NATIVE, - GEMM_HYBRID, - GEMM_INTERLEAVED, - GEMM_INTERLEAVED_2D, - QUANTIZE_WRAPPER, - QUANTIZE_WRAPPER_2D, - GEMM_HYBRID_QUANTIZED -}; - -struct KernelDescription -{ - GemmMethod method = GemmMethod::DEFAULT; - std::string name = ""; - bool is_default = false; - uint64_t cycle_estimate = 0; - - KernelDescription(GemmMethod m, std::string n, bool d = false, uint64_t c = 0) - : method(m), name(n), is_default(d), cycle_estimate(c) - { - } - KernelDescription() noexcept - { - } -}; - -struct GemmConfig -{ - GemmMethod method = GemmMethod::DEFAULT; - std::string filter = ""; - unsigned int inner_block_size = 0; - unsigned int outer_block_size = 0; - - GemmConfig(GemmMethod method) - : method(method) - { - } - GemmConfig() - { - } -}; - -struct Activation -{ - enum class Type - { - None, - ReLU, - BoundedReLU - }; - - Type type; - float param1; - float param2; - - Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f) - : type(type), param1(p1), param2(p2) - { - } -}; - -struct GemmArgs -{ -public: - const CPUInfo *_ci; - unsigned int _Msize; - unsigned int _Nsize; - unsigned int _Ksize; - unsigned int _Ksections; - unsigned int _nbatches; - unsigned int _nmulti; - bool _indirect_input; - Activation _act; - int _maxthreads; - bool _fast_mode; - const GemmConfig *_cfg; - - GemmArgs(const CPUInfo *ci, unsigned int M, unsigned int N, - unsigned int K, unsigned int Ksections, unsigned int nbatches, - unsigned int nmulti, bool indirect_input, Activation act, const int maxthreads, - bool fast_mode = false, const GemmConfig *cfg = nullptr) - : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _Ksections(Ksections), _nbatches(nbatches), _nmulti(nmulti), _indirect_input(indirect_input), _act(act), _maxthreads(maxthreads), _fast_mode(fast_mode), - _cfg(cfg) - { - } -}; - -struct Requantize32 -{ -public: - const int32_t *bias = nullptr; - size_t bias_multi_stride = 0; - int32_t a_offset = 0; - int32_t b_offset = 0; - int32_t c_offset = 0; - bool per_channel_requant = false; - int32_t per_layer_left_shift = 0; - int32_t per_layer_right_shift = 0; - int32_t per_layer_mul = 0; - const int32_t *per_channel_left_shifts = nullptr; - const int32_t *per_channel_right_shifts = nullptr; - const int32_t *per_channel_muls = nullptr; - int32_t minval = 0; - int32_t maxval = 0; - - Requantize32() = default; - - // Constructor for per-tensor quantization - Requantize32(const int32_t *bias, size_t bias_multi_stride, - int32_t a_offset, int32_t b_offset, int32_t c_offset, - int32_t requant_shift, int32_t requant_mul, int32_t minv, int32_t maxv) - : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_left_shift(std::max(requant_shift, 0)), - per_layer_right_shift(std::min(requant_shift, 0)), per_layer_mul(requant_mul), minval(minv), maxval(maxv) - { - } - - // Constructor for per-channel quantization - Requantize32(const int32_t *bias, size_t bias_multi_stride, - int32_t a_offset, int32_t b_offset, int32_t c_offset, - const int32_t *requant_left_shifts, - const int32_t *requant_right_shifts, - const int32_t *requant_muls, - int32_t minv, int32_t maxv) - : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), 
per_channel_requant(true), per_channel_left_shifts(requant_left_shifts), - per_channel_right_shifts(requant_right_shifts), per_channel_muls(requant_muls), minval(minv), maxval(maxv) - { - } -}; - -struct Nothing -{ -}; - -template -using UniqueGemmCommon = std::unique_ptr>; - -/* Low level API calls. - * These are implemented as 'GemmArgs' versions, or with the arguments explicitly listed. */ - -/* get_gemm_method(): Given the templated types and provided parameters, - * which is the preferred method to implement this GEMM? */ -template -KernelDescription get_gemm_method(const GemmArgs &args, const OutputStage & = {}); - -template -UniqueGemmCommon gemm(const GemmArgs &args, const OutputStage & = {}); - -template -std::vector get_compatible_kernels(const GemmArgs &args, const OutputStage & = {}); - -} // namespace arm_gemm diff --git a/src/core/cpu/kernels/assembly/arm_gemm_compute_iface.hpp b/src/core/cpu/kernels/assembly/arm_gemm_compute_iface.hpp deleted file mode 100644 index 718fcd1fb4..0000000000 --- a/src/core/cpu/kernels/assembly/arm_gemm_compute_iface.hpp +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#include "arm_compute/core/Dimensions.h" -#include "arm_compute/core/Window.h" - -#include "ndrange.hpp" - -#include - -/* This file contains mapping between integral types used in arm_compute and arm_gemm - * These two codebases both require a degree of separation for the sake of modularity - * so maintain their own types which represent similar information. 
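/*
 * Construction sketch for the argument structs above (illustrative only; `ci` and `bias_ptr`
 * stand for a valid CPUInfo pointer and bias buffer, and the numeric values are arbitrary).
 * A GemmArgs plus an optional output stage such as Requantize32 is what callers hand to the
 * get_gemm_method() / gemm() factories declared in arm_gemm.hpp above.
 */
arm_gemm::Activation act(arm_gemm::Activation::Type::BoundedReLU, 6.0f, 0.0f);
arm_gemm::GemmArgs   args(ci, /*M*/ 128, /*N*/ 64, /*K*/ 256, /*Ksections*/ 1,
                          /*nbatches*/ 1, /*nmulti*/ 1, /*indirect_input*/ false,
                          act, /*maxthreads*/ 4);
arm_gemm::Requantize32 requant(bias_ptr, /*bias_multi_stride*/ 0,
                               /*a_offset*/ 10, /*b_offset*/ -5, /*c_offset*/ 3,
                               /*requant_shift*/ -8, /*requant_mul*/ 1 << 30,
                               /*minv*/ -128, /*maxv*/ 127);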
- */ - -namespace arm_gemm -{ -//we want to unify the maximum number of dimensions used between arm_gemm and arm compute library -constexpr std::size_t ndrange_max = - arm_compute::Dimensions::num_max_dimensions; - -using ndrange_t = NDRange; -using ndcoord_t = NDCoordinate; - -/* Converts an `arm_gemm::ndrange_t` to a `arm_compute::Window` - * - * As `NDRange` does not encode start positions, we specify - * the start to be zero in the produced `arm_compute::Window` - * - * @param [ndr] the `arm_gemm::ndrange_t` we wish to convert into a `arm_compute::Window` - * @returns an `arm_compute::Window` representing the same dimensional ranges as `ndr` - */ -inline arm_compute::Window to_window(const ndrange_t &ndr) -{ - arm_compute::Window win; - - for(unsigned int i = 0; i != ndrange_max; ++i) - { - //populate the window with the dimensions of the NDRange - win.set(i, arm_compute::Window::Dimension(0, ndr.get_size(i))); - } - - return win; -} - -/* - * Converts an `arm_gemm::ndcoord_t` to a `arm_compute::Window` - * - * @param [ndc] the `arm_gemm::ndcoord_t` we wish to convert into a `arm_compute::Window` - * @returns an `arm_compute::Window` representing the same dimensional ranges as `ndc` - */ -inline arm_compute::Window to_window(const ndcoord_t &ndc) -{ - arm_compute::Window win; - - for(unsigned int i = 0; i != ndrange_max; ++i) - { - const auto start = ndc.get_position(i); - const auto size = ndc.get_size(i); - const auto stop = start + size; - - //populate the window with the dimensions of the NDRange - win.set(i, arm_compute::Window::Dimension(start, stop)); - } - - return win; -} - -/** Convert an `arm_compute::Window` to an `arm_gemm::NDRange` of the same max dimensions - * - * It should be noted that `arm_compute::Window` specifies a `start()` and an `end()` - * whereas `arm_gemm::ndrange_t` only has a size, as a result we store the delta between start and end - * - * @param [win] the `arm_compute::Window` we want to convert to `arm_gemm::ndrange_t` - * @return the resultant ndrange_t - */ -inline ndrange_t to_ndrange(const arm_compute::Window &win) -{ - return - { - static_cast(win[0].end() - win[0].start()), - static_cast(win[1].end() - win[1].start()), - static_cast(win[2].end() - win[2].start()), - static_cast(win[3].end() - win[3].start()), - static_cast(win[4].end() - win[4].start()), - static_cast(win[5].end() - win[5].start()) - }; -} - -/** Convert an `arm_compute::Window` to an `arm_gemm::NDCoord` of the same max dimensions - * - * @param [win] the `arm_compute::Window` we want to convert to `arm_gemm::ndcoord_t` - * @return the resultant ndcoord_t - */ -inline ndcoord_t to_ndcoord(const arm_compute::Window &win) -{ - return - { - { static_cast(win[0].start()), static_cast(win[0].end() - win[0].start()) }, - { static_cast(win[1].start()), static_cast(win[1].end() - win[1].start()) }, - { static_cast(win[2].start()), static_cast(win[2].end() - win[2].start()) }, - { static_cast(win[3].start()), static_cast(win[3].end() - win[3].start()) }, - { static_cast(win[4].start()), static_cast(win[4].end() - win[4].start()) }, - { static_cast(win[5].start()), static_cast(win[5].end() - win[5].start()) } - }; -} - -} //namespace arm_gemm diff --git a/src/core/cpu/kernels/assembly/arm_gemm_local.hpp b/src/core/cpu/kernels/assembly/arm_gemm_local.hpp deleted file mode 100644 index 78e0adf31f..0000000000 --- a/src/core/cpu/kernels/assembly/arm_gemm_local.hpp +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited.
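/*
 * Usage sketch (illustrative only) for the conversion helpers in arm_gemm_compute_iface.hpp
 * above: a Window stores a start and an end per dimension, an ndcoord_t stores {start, size}
 * pairs, and an ndrange_t keeps only the sizes.
 */
arm_compute::Window win;
win.set(0, arm_compute::Window::Dimension(8, 24));  // start 8, end 24
const auto ndc  = arm_gemm::to_ndcoord(win);        // dimension 0 becomes { 8, 16 }
const auto ndr  = arm_gemm::to_ndrange(win);        // dimension 0 becomes size 16, the start is dropped
const auto win2 = arm_gemm::to_window(ndc);         // restores [8, 24); to_window(ndr) would yield [0, 16)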
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -/* This file is used to configure integration-specific aspects of arm_gemm into ACL */ - -#include "arm_compute/core/CPP/CPPTypes.h" - -using CPUModel = arm_compute::CPUModel; -using CPUInfo = arm_compute::CPUInfo; diff --git a/src/core/cpu/kernels/assembly/convolution_parameters.hpp b/src/core/cpu/kernels/assembly/convolution_parameters.hpp deleted file mode 100644 index 0c1ae58902..0000000000 --- a/src/core/cpu/kernels/assembly/convolution_parameters.hpp +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#include - -namespace arm_gemm -{ -/* - * Parameter set for "convolution" type GEMM. - * - * For a "convolution" GEMM, the GEMM parameters (M, K) are specified as if - * an im2row had been performed on the input tensor to generate the operand - * matrix, but instead this structure describes the convolution parameters - * such that this can be done on the fly. - * - * The parameters describe the convolution details - the notional shape of - * the input and output tensors, whether padding is to be applied, the size - * of the kernel and a constant value to be used for padding (needed for - * quantized tensors). 
- * - * The second part describes the layout of the input tensor in memory, which - * is assumed to be in NHWC format. This consists of a base pointer and - * strides for columns, rows and batches. 'multis' are not supported for - * convolution type GEMMs. - */ -struct ConvolutionParameters -{ - int64_t input_width; - int64_t input_height; - int64_t input_channels; - int64_t kernel_width; - int64_t kernel_height; - int64_t output_width; - int64_t output_height; - int64_t output_stride_w; - int64_t output_stride_h; - // output_channels not included as they do not affect the input. - int64_t padding_top; - int64_t padding_left; - float padding_value; -}; - -} // namespace arm_gemm diff --git a/src/core/cpu/kernels/assembly/gemm_common.hpp b/src/core/cpu/kernels/assembly/gemm_common.hpp deleted file mode 100644 index 378f1041be..0000000000 --- a/src/core/cpu/kernels/assembly/gemm_common.hpp +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#include "convolution_parameters.hpp" -#include "ndrange.hpp" - -#include - -namespace arm_gemm -{ -// Avoid circular dependency with arm_gemm.hpp -struct GemmConfig; - -// Abstract class for the GEMM/GEMV functions. -// -// GEMM implementations may be "native" (never require any input -// permutation), "pretransposed" (require permutation up-front) or require -// working space (permute as they go along). This interface should support -// all of them. - -// The real GemmCommon class is templated based on the operand and return -// type. This is an interface class which is independent of those types. -class IGemmCommon -{ -public: - /* Pass in the pointers to the arrays to be operated on and their - * strides. This "generic" version uses void *s, the preferred version - * is the one provided by templated GemmCommon (below) which takes - * appropriately typed pointers. If B is pretransposed (see below) then - * the settings for B here are ignored. 
- */ - virtual void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride, - const void *B, const int ldb, /* batches share B */ const int B_multi_stride, - void *C, const int ldc, const int C_batch_stride, const int C_multi_stride, - const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0; - - /** @returns an ndrange containing ranges of the compute space which can be - * broken up and parallelised over - */ - virtual ndrange_t get_window_size() const = 0; - - /* The maximum thread count is specified when the GEMM is created. Some - * implementations need to know how many threads will actually run in - * order to work properly. - * - * In some cases, after creating the GEMM the number of threads needs to - * be reduced (e.g. not enough work to split across threads). This - * method allows the number of actual threads to be run to be set (must - * be equal or lower). - * - * This has an empty default implementation, as GEMMs which don't care - * about thread count can safely ignore this. - */ - virtual void set_nthreads(int) {}; - - /* Whether this GEMM can be dynamically scheduled or not. */ - virtual bool supports_dynamic_scheduling() const - { - return false; - } - - /** Main execute member function - * @param [in] work_range specifies the range of work we want to be computed, total range defined by get_window_size() - * @param [in] thread_locator where are we inside of the thread space - * @param [in] threadid a unique threadid - */ - virtual void execute(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid) = 0; - - /*** Working space interface (optional) ***/ - /* Total number of bytes of temporary working space needed. If zero, it's not necessary to call set_working_space(). */ - virtual size_t get_working_size() const - { - return 0; - } - /* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */ - virtual void set_working_space(void *) {}; - - /*** "Pretransposed" interface (optional) ***/ - /* Is this object set up for pretranspose? If so, pretranspose_array() needs to be called before execute(); */ - virtual bool B_is_pretransposed() const - { - return false; - } - /* Does pretranspose still need to be done? */ - virtual bool B_pretranspose_required() const - { - return false; - } - /* Total number of bytes of space needed for pretransposed arrays. */ - virtual size_t get_B_pretransposed_array_size() const - { - return 0; - } - /* Perform pretranspose - arguments are output, input, input row stride and input multi stride. */ - /* The "real" version of this depends on the templated operand type (see below). */ - virtual void pretranspose_B_array_generic(void *, const void *, const int, const int) = 0; - /* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */ - virtual void set_pretransposed_B_data(void *) - { - } - - /*** "Quantized bias" interface (optional) ***/ - /* Set the bias vector for quantized GEMMs */ - virtual void set_quantized_bias(const int32_t *, size_t) - { - } - - /*** Indirect interface (optional) ***/ - /* Set the indirect table.
This comprises a number of values per kernel point, and a densely packed array of pointers, - * multis * batches * kernel_points */ - virtual void set_indirect_parameters_generic(size_t, const void *const *const *) - { - } - - /*** Convolution interface (optional) ***/ - /* Set the convolution parameters. */ - virtual void set_convolution_parameters(ConvolutionParameters) - { - } - - /*** Introspection interface ***/ - /* Get the configuration of this GEMM */ - virtual GemmConfig get_config() = 0; - - // Destructor - virtual ~IGemmCommon() - { - } -}; - -/* "Real" GemmCommon class which is templated on the operand and return types. - * - * In addition to correctly typed versions of the functions that operate on - * operand and return data, this class provides a default implementation of - * 'set_arrays' to capture the provided arguments in protected class - * members, as essentially any implementation will need these. - */ -template -class GemmCommon : public IGemmCommon -{ -protected: - const To *_Aptr = nullptr; - int _lda = 0; - int _A_batch_stride = 0; - int _A_multi_stride = 0; - const To *_Bptr = nullptr; - int _ldb = 0; - int _B_multi_stride = 0; - Tr *_Cptr = nullptr; - int _ldc = 0; - int _C_batch_stride = 0; - int _C_multi_stride = 0; - const Tr *_bias = nullptr; - int _bias_multi_stride = 0; - -public: - /* Pass in the pointers to the arrays to be operated on and their - * strides (templated version with appropriate types). */ - virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride, - const To *B, const int ldb, /* batches share B */ const int B_multi_stride, - Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride, - const Tr *bias, /* no row or batch stride needed */ const int bias_multi_stride) - { - _Aptr = A; - _lda = lda; - _A_batch_stride = A_batch_stride; - _A_multi_stride = A_multi_stride; - _Bptr = B; - _ldb = ldb; - _B_multi_stride = B_multi_stride; - _Cptr = C; - _ldc = ldc; - _C_batch_stride = C_batch_stride; - _C_multi_stride = C_multi_stride; - _bias = bias; - _bias_multi_stride = bias_multi_stride; - } - - /* Implementation of the void * overload which casts its arguments to the appropriate type. */ - void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride, - const void *B, const int ldb, /* batches share B */ const int B_multi_stride, - void *C, const int ldc, const int C_batch_stride, const int C_multi_stride, - const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) override - { - set_arrays(static_cast(A), lda, A_batch_stride, A_multi_stride, - static_cast(B), ldb, B_multi_stride, - static_cast(C), ldc, C_batch_stride, C_multi_stride, - static_cast(bias), bias_multi_stride); - } - - /*** "Pretransposed" interface ***/ - - /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */ - /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */ - virtual void pretranspose_B_array(void *, const To *, const int, const int) {}; - - /* Implementation of the void * overload which casts its arguments to the appropriate type. 
*/ - void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override - { - pretranspose_B_array(out, static_cast(in), row_stride, multi_stride); - } - - /*** Indirect interface ***/ - virtual void set_indirect_parameters(size_t, const To *const *const *) - { - } - - void set_indirect_parameters_generic(size_t sz, const void *const *const *ptr) override - { - set_indirect_parameters(sz, reinterpret_cast(ptr)); - } -}; - -} // namespace arm_gemm diff --git a/src/core/cpu/kernels/assembly/ndrange.hpp b/src/core/cpu/kernels/assembly/ndrange.hpp deleted file mode 100644 index 1c8261aef7..0000000000 --- a/src/core/cpu/kernels/assembly/ndrange.hpp +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#include -#include -#include -#include - -namespace arm_gemm -{ -template -class NDRange -{ -private: - std::array m_sizes{}; - std::array m_totalsizes{}; - - class NDRangeIterator - { - private: - const NDRange &m_parent; - unsigned int m_pos = 0; - unsigned int m_end = 0; - - public: - NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) - : m_parent(p), m_pos(s), m_end(e) - { - } - - bool done() const - { - return (m_pos >= m_end); - } - - unsigned int dim(unsigned int d) const - { - unsigned int r = m_pos; - - if(d < (D - 1)) - { - r %= m_parent.m_totalsizes[d]; - } - - if(d > 0) - { - r /= m_parent.m_totalsizes[d - 1]; - } - - return r; - } - - bool next_dim0() - { - m_pos++; - - return !done(); - } - - bool next_dim1() - { - m_pos += m_parent.m_sizes[0] - dim(0); - - return !done(); - } - - unsigned int dim0_max() const - { - unsigned int offset = std::min(m_end - m_pos, m_parent.m_sizes[0] - dim(0)); - - return dim(0) + offset; - } - }; - - void set_totalsizes() - { - unsigned int t = 1; - - for(unsigned int i = 0; i < D; i++) - { - if(m_sizes[i] == 0) - { - m_sizes[i] = 1; - } - - t *= m_sizes[i]; - - m_totalsizes[i] = t; - } - } - -public: - NDRange &operator=(const NDRange &rhs) = default; - NDRange(const NDRange &rhs) = default; - - template - NDRange(T... ts) - : m_sizes{ ts... 
} - { - set_totalsizes(); - } - - NDRange(const std::array &n) - : m_sizes(n) - { - set_totalsizes(); - } - - NDRangeIterator iterator(unsigned int start, unsigned int end) const - { - return NDRangeIterator(*this, start, end); - } - - unsigned int total_size() const - { - return m_totalsizes[D - 1]; - } - - unsigned int get_size(unsigned int v) const - { - return m_sizes[v]; - } -}; - -/** NDCoordinate builds upon a range, but specifies a starting position - * in addition to a size which it inherits from NDRange - */ -template -class NDCoordinate : public NDRange -{ - using int_t = unsigned int; - using ndrange_t = NDRange; - - std::array m_positions{}; - -public: - NDCoordinate &operator=(const NDCoordinate &rhs) = default; - NDCoordinate(const NDCoordinate &rhs) = default; - NDCoordinate(const std::initializer_list> &list) - { - std::array sizes{}; - - std::size_t i = 0; - for(auto &p : list) - { - m_positions[i] = p.first; - sizes[i++] = p.second; - } - - //update the parents sizes - static_cast(*this) = ndrange_t(sizes); - } - - int_t get_position(int_t d) const - { - assert(d < N); - - return m_positions[d]; - } - - void set_position(int_t d, int_t v) - { - assert(d < N); - - m_positions[d] = v; - } - - int_t get_position_end(int_t d) const - { - return get_position(d) + ndrange_t::get_size(d); - } -}; //class NDCoordinate - -using ndrange_t = NDRange<6>; -using ndcoord_t = NDCoordinate<6>; - -} // namespace arm_gemm diff --git a/src/core/cpu/kernels/elementwise/neon/elementwise_list.h b/src/core/cpu/kernels/elementwise/neon/elementwise_list.h deleted file mode 100644 index 43e44be5e2..0000000000 --- a/src/core/cpu/kernels/elementwise/neon/elementwise_list.h +++ /dev/null @@ -1,486 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
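/*
 * Usage sketch (illustrative only) for the NDRange / NDCoordinate classes in ndrange.hpp above:
 * an NDCoordinate is built from one {position, size} pair per dimension, while the inherited
 * NDRange part only sees the sizes.
 */
arm_gemm::NDCoordinate<2> coord{ { 4, 16 }, { 0, 8 } }; // dim 0: start 4, size 16; dim 1: start 0, size 8
const auto start0 = coord.get_position(0);     // 4
const auto size0  = coord.get_size(0);         // 16, from the NDRange base
const auto end0   = coord.get_position_end(0); // 4 + 16 = 20
const auto total  = coord.total_size();        // 16 * 8 = 128 points in the flattened space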
- */ -#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H -#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H - -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -template -void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), - int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool), - int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *)) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = std::min(16 / static_cast(sizeof(OutputScalarType)), 8); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_value, output_ptr, !is_broadcast_input_2); - for(; x < window_end_x; ++x) - { - const auto a = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, !is_broadcast_input_2 ? 
a : broadcast_value); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr); - for(; x < window_end_x; ++x) - { - const auto a = *(input1_ptr + x); - const auto b = *(input2_ptr + x); - *(output_ptr + x) = (*scalar_func)(a, b); - } - }, - input1, input2, output); - } -} - -template -inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const ScalarType &b) -{ - auto res = ScalarType(0); - - switch(op) - { - case ArithmeticOperation::MAX: - res = std::max(a, b); - break; - case ArithmeticOperation::MIN: - res = std::min(a, b); - break; - case ArithmeticOperation::SQUARED_DIFF: - { - res = (a - b) * (a - b); - break; - } - case ArithmeticOperation::PRELU: - { - res = (a > 0 ? a : a * b); - break; - } - case ArithmeticOperation::DIV: - { - res = a / b; - if(std::is_integral::value) - { - res = (b == 0) ? 0 : res; - if(static_cast(a) % static_cast(b) != 0 && ((a < 0) != (b < 0))) - { - --res; - } - } - break; - } - case ArithmeticOperation::POWER: - { - res = std::pow(a, b); - break; - } - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - return res; -} - -template -inline typename VectorType::type elementwise_arithm_op(const typename VectorType::type &a, const typename VectorType::type &b) -{ - using vec_type = typename VectorType::type; - using scalar_type = typename VectorType::scalar_type; - using tag_type = typename VectorType::tag_type; - - vec_type res = wrapper::vdup_n(static_cast(0), tag_type{}); - - switch(op) - { - case ArithmeticOperation::MAX: - res = wrapper::vmax(a, b); - break; - case ArithmeticOperation::MIN: - res = wrapper::vmin(a, b); - break; - case ArithmeticOperation::SQUARED_DIFF: - { - const vec_type tmp = wrapper::vsub(a, b); - res = wrapper::vmul(tmp, tmp); - break; - } - case ArithmeticOperation::PRELU: - { - const vec_type zero = wrapper::vdup_n(static_cast(0), tag_type{}); - const vec_type tmp = wrapper::vmul(a, b); - const auto gt = wrapper::vcgt(a, zero); - - res = wrapper::vbsl(gt, a, tmp); - break; - } - - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - - return res; -} - -template <> -inline int32x4_t elementwise_arithm_op>(const int32x4_t &a, const int32x4_t &b) -{ - return vcvtq_s32_f32(vfloorq_f32(wrapper::vdiv(vcvtq_f32_s32(a), vcvtq_f32_s32(b)))); -} - -template <> -inline float32x4_t elementwise_arithm_op>(const float32x4_t &a, const float32x4_t &b) -{ - return wrapper::vdiv(a, b); -} - -template <> -inline float32x4_t elementwise_arithm_op>(const float32x4_t &a, const float32x4_t &b) -{ - return wrapper::vpow(a, b); -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template <> -inline float16x8_t elementwise_arithm_op>(const float16x8_t &a, const float16x8_t &b) -{ - return wrapper::vdiv(a, b); -} - -template <> -inline float16x8_t elementwise_arithm_op>(const float16x8_t &a, const float16x8_t &b) -{ - return wrapper::vpow(a, b); -} -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -template -inline 
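For integral element types, the ArithmeticOperation::DIV case above turns C++'s truncating division into floor division (rounding toward negative infinity) and defines division by zero as 0. A standalone scalar version of the same rule, with a few worked values:

    #include <cassert>

    // Floor division with x/0 defined as 0, matching the integral DIV case above.
    int floor_div(int a, int b)
    {
        if(b == 0)
        {
            return 0;
        }
        int res = a / b;                          // C++ division truncates toward zero
        if((a % b != 0) && ((a < 0) != (b < 0)))  // remainder exists and operands have opposite signs
        {
            --res;                                // step down to the floor
        }
        return res;
    }

    int main()
    {
        assert(floor_div(7, 2) == 3);    //  3.5 ->  3
        assert(floor_div(-7, 2) == -4);  // -3.5 -> -4 (plain truncation would give -3)
        assert(floor_div(5, 0) == 0);    // division by zero maps to 0
    }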
typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a, const ScalarType &broadcast_value, const bool reorder) -{ - using tag_type = typename VectorType::tag_type; - using vec_type = typename VectorType::type; - - vec_type broadcast_vector = wrapper::vdup_n(broadcast_value, tag_type{}); - return elementwise_arithm_op(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector); -} - -template -inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int window_step_x, - const ScalarType *input1_ptr, const ScalarType *input2_ptr, ScalarType *output_ptr) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - wrapper::vstore(output_ptr + x, elementwise_arithm_op(a, b)); - } - return x; -} - -template -inline int elementwise_arithm_op_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); - wrapper::vstore(output_ptr + x, elementwise_arithm_op_broadcast(a, broadcast_value, reorder)); - } - return x; -} - -template -void elementwise_arithm_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - using scalar_type = typename VectorType::scalar_type; - - elementwise_op(in1, in2, out, window, - &elementwise_arithm_op_scalar, - &elementwise_arithm_op_broadcast_loop, - &elementwise_arithm_op_loop); -} - -template -inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputScalarType &b) -{ - bool res = false; - - switch(op) - { - case ComparisonOperation::Equal: - res = (a == b); - break; - case ComparisonOperation::NotEqual: - res = (a != b); - break; - case ComparisonOperation::Greater: - res = (a > b); - break; - case ComparisonOperation::GreaterEqual: - res = (a >= b); - break; - case ComparisonOperation::Less: - res = (a < b); - break; - case ComparisonOperation::LessEqual: - res = (a <= b); - break; - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - return res ? ~static_cast(0) : static_cast(0); -} - -template -inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b) -{ - OutputVectorType res = { 0, 0, 0, 0 }; - - switch(op) - { - case ComparisonOperation::Equal: - res = wrapper::vceq(a, b); - break; - case ComparisonOperation::NotEqual: - res = wrapper::vnot(wrapper::vceq(a, b)); - break; - case ComparisonOperation::Greater: - res = wrapper::vcgt(a, b); - break; - case ComparisonOperation::GreaterEqual: - res = wrapper::vcge(a, b); - break; - case ComparisonOperation::Less: - res = wrapper::vcgt(b, a); - break; - case ComparisonOperation::LessEqual: - res = wrapper::vcge(b, a); - break; - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - - return res; -} - -template -inline OutputVectorType elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder) -{ - InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag()); - return elementwise_comp_op(reorder ? broadcast_vector : a, reorder ? 
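The *_op_loop helpers above consume full 128-bit vectors, window_step_x elements at a time, and return the index they stopped at so the caller can finish the remainder with the scalar functor. The same pattern in isolation, using MAX on float32 (a sketch that builds only for a NEON-capable target):

    #include <arm_neon.h>
    #include <algorithm>

    void max_f32(const float *a, const float *b, float *out, int n)
    {
        int x = 0;
        for(; x <= n - 4; x += 4)   // vector main loop: 4 floats per iteration
        {
            vst1q_f32(out + x, vmaxq_f32(vld1q_f32(a + x), vld1q_f32(b + x)));
        }
        for(; x < n; ++x)           // scalar tail, as the framework's caller does after the loop returns x
        {
            out[x] = std::max(a[x], b[x]);
        }
    }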
a : broadcast_vector); -} - -template -inline int elementwise_comp_op_broadcast_8_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = elementwise_comp_op_broadcast(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); - wrapper::vstore(output_ptr + x, a); - } - return x; -} - -template -inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = elementwise_comp_op_broadcast(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); - wrapper::vstore(output_ptr + x, wrapper::vmovn(a)); - } - return x; -} - -template -inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = elementwise_comp_op_broadcast(wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder); - const auto b = elementwise_comp_op_broadcast(wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder); - wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b)))); - } - if(x <= window_end_x - 4) - { - const auto a = elementwise_comp_op_broadcast(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); - for(int i = 0; i < 4; i++) - { - *(output_ptr + x + i) = wrapper::vgetlane(a, i); - } - x = +4; - } - return x; -} - -template -inline int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - const auto res = elementwise_comp_op(a, b); - wrapper::vstore(output_ptr + x, res); - } - return x; -} - -template -inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - const auto res = elementwise_comp_op(a, b); - wrapper::vstore(output_ptr + x, wrapper::vmovn(res)); - } - return x; -} - -template -inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - auto a = wrapper::vloadq(input1_ptr + x); - auto b = wrapper::vloadq(input2_ptr + x); - const auto res = elementwise_comp_op(a, b); - a = 
wrapper::vloadq(input1_ptr + x + 4); - b = wrapper::vloadq(input2_ptr + x + 4); - const auto res2 = elementwise_comp_op(a, b); - wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2)))); - } - if(x <= window_end_x - 4) - { - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - const auto res = elementwise_comp_op(a, b); - for(int i = 0; i < 4; i++) - { - *(output_ptr + x + i) = wrapper::vgetlane(res, i); - } - x = +4; - } - return x; -} - -template -void elementwise_comp_op_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - elementwise_op(in1, in2, out, window, - &elementwise_comp_op_scalar, - &elementwise_comp_op_broadcast_8_loop, - &elementwise_comp_op_8_loop); -} - -template -void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - elementwise_op(in1, in2, out, window, - &elementwise_comp_op_scalar, - &elementwise_comp_op_broadcast_16_loop, - &elementwise_comp_op_16_loop); -} - -template -void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - elementwise_op(in1, in2, out, window, - &elementwise_comp_op_scalar, - &elementwise_comp_op_broadcast_32_loop, - &elementwise_comp_op_32_loop); -} -} // namesapce cpu -} // namespace arm_compute - -#endif /* SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H */ \ No newline at end of file diff --git a/src/core/cpu/kernels/elementwise/neon/elementwise_quantized_list.h b/src/core/cpu/kernels/elementwise/neon/elementwise_quantized_list.h deleted file mode 100644 index 1ff4632f5c..0000000000 --- a/src/core/cpu/kernels/elementwise/neon/elementwise_quantized_list.h +++ /dev/null @@ -1,654 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
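The comparison kernels above always write a U8 mask output (0x00 or 0xFF per element), so for 16- and 32-bit inputs the per-lane results have to be narrowed before they are stored, which is what the vmovn/vcombine chains in elementwise_comp_op_16_loop and elementwise_comp_op_32_loop do. The narrowing step in isolation (NEON-only sketch, illustrative function name):

    #include <arm_neon.h>

    // Two uint32x4_t masks (all-ones / all-zeros per lane) become 8 bytes of 0xFF / 0x00.
    uint8x8_t masks_to_bytes(uint32x4_t m0, uint32x4_t m1)
    {
        const uint16x8_t m16 = vcombine_u16(vmovn_u32(m0), vmovn_u32(m1));
        return vmovn_u16(m16);
    }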
- */ -#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H -#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H - -#include "src/core/cpu/kernels/elementwise/neon/elementwise_list.h" - -namespace arm_compute -{ -namespace cpu -{ -float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale) -{ - qasymm8x16_t x = vld1q_u8(input1_ptr); - const float32x4x4_t out = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale), - } - }; - return out; -} - -float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale) -{ - qasymm8x16_signed_t x = vld1q_s8(input1_ptr); - const float32x4x4_t out = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), - } - }; - return out; -} - -void store_quantized(uint8_t *output_ptr, const uint32x4x4_t &out) -{ - const uint8x8_t pa = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[0]), vqmovn_u32(out.val[1]))); - const uint8x8_t pb = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[2]), vqmovn_u32(out.val[3]))); - vst1q_u8(output_ptr, vcombine_u8(pa, pb)); -} - -void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out) -{ - const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1]))); - const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3]))); - vst1q_u8(output_ptr, vcombine_u8(pa, pb)); -} - -void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) -{ - int32x4x4_t out = - { - { - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), - } - }; - store_quantized(output_ptr, out); -} - -void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out) -{ - const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1]))); - const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3]))); - vst1q_s8(output_ptr, vcombine_s8(pa, pb)); -} - -void store_quantized_signed(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) -{ - int32x4x4_t out = - { - { - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), - } - }; - store_quantized_signed(output_ptr, out); -} - -template -inline uint8_t elementwise_arithm_op_quantized_scalar(const float &a, const float &b, 
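load_quantized and load_quantized_signed above widen 16 quantized values loaded with vld1q into four float32x4 registers and dequantize them as scale * (q - offset); the store helpers perform the inverse. A sketch of the same widening for half a register, i.e. 8 unsigned bytes (NEON-only, illustrative function name):

    #include <arm_neon.h>
    #include <cstdint>

    // u8 -> u16 -> u32 -> f32, then (q - offset) * scale, for 8 values.
    void dequantize_8_values(const uint8_t *src, float *dst, int32_t offset, float scale)
    {
        const uint8x8_t   q8      = vld1_u8(src);
        const uint16x8_t  q16     = vmovl_u8(q8);
        const int32x4_t   lo      = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(q16)));
        const int32x4_t   hi      = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(q16)));
        const int32x4_t   voffset = vdupq_n_s32(offset);
        const float32x4_t vscale  = vdupq_n_f32(scale);
        vst1q_f32(dst,     vmulq_f32(vcvtq_f32_s32(vsubq_s32(lo, voffset)), vscale));
        vst1q_f32(dst + 4, vmulq_f32(vcvtq_f32_s32(vsubq_s32(hi, voffset)), vscale));
    }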
UniformQuantizationInfo qinfo) -{ - return quantize_qasymm8(elementwise_arithm_op_scalar(a, b), qinfo); -} - -template -inline int8_t elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) -{ - return quantize_qasymm8_signed(elementwise_arithm_op_scalar(a, b), qinfo); -} - -template -inline float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t &b) -{ - using neon_vector_float = wrapper::traits::neon_vector; - float32x4x4_t out = - { - { - elementwise_arithm_op(a.val[0], b.val[0]), - elementwise_arithm_op(a.val[1], b.val[1]), - elementwise_arithm_op(a.val[2], b.val[2]), - elementwise_arithm_op(a.val[3], b.val[3]), - } - }; - return out; -} - -template -inline uint8_t elementwise_comp_op_quantized_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) -{ - ARM_COMPUTE_UNUSED(qinfo); - return elementwise_comp_op_scalar(a, b); -} - -template -inline uint32x4x4_t elementwise_comp_op(const float32x4x4_t &a, const float32x4x4_t &b) -{ - uint32x4x4_t out = - { - { - elementwise_comp_op(a.val[0], b.val[0]), - elementwise_comp_op(a.val[1], b.val[1]), - elementwise_comp_op(a.val[2], b.val[2]), - elementwise_comp_op(a.val[3], b.val[3]) - } - }; - return out; -} - -template -inline int elementwise_arithm_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x, - const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get inputs and compute output - const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); - const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2); - const float32x4x4_t rf = elementwise_arithm_op(af, bf); - store_quantized(output_ptr + x, rf, voffseto, invvscaleo); - } - return x; -} - -template -inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x, int window_end_x, int window_step_x, - const int8_t *input1_ptr, const int8_t *input2_ptr, int8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get inputs and compute output - const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1); - const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2); - const float32x4x4_t rf = elementwise_arithm_op(af, bf); - store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo); - } - return x; -} - -template -inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, - int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, - float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const float32x4x4_t rf = elementwise_arithm_op(reorder ? broadcast_vector : af, reorder ? 
af : broadcast_vector); - store_quantized(output_ptr + x, rf, voffseto, invvscaleo); - } - return x; -} -template -inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, int8_t *output_ptr, - int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, - float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) -{ - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const float32x4x4_t rf = elementwise_arithm_op(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); - store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo); - } - return x; -} - -template -inline int elementwise_comp_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x, - const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) -{ - ARM_COMPUTE_UNUSED(voffseto, invvscaleo); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); - const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2); - const uint32x4x4_t rf = elementwise_comp_op(af, bf); - store_quantized(output_ptr + x, rf); - } - return x; -} - -template -inline int elementwise_comp_op_quantized_signed_loop(int window_start_x, int window_end_x, int window_step_x, - const int8_t *input1_ptr, const int8_t *input2_ptr, uint8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) -{ - ARM_COMPUTE_UNUSED(voffseto, invvscaleo); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1); - const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2); - const uint32x4x4_t rf = elementwise_comp_op(af, bf); - store_quantized(output_ptr + x, rf); - } - return x; -} - -template -inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, - int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, - float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) -{ - ARM_COMPUTE_UNUSED(voffseto, invvscaleo); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const uint32x4x4_t rf = elementwise_comp_op(reorder ? broadcast_vector : af, reorder ? 
af : broadcast_vector); - store_quantized(output_ptr + x, rf); - } - return x; -} - -template -inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, - int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, - float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) -{ - ARM_COMPUTE_UNUSED(voffseto, invvscaleo); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const uint32x4x4_t rf = elementwise_comp_op(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); - store_quantized(output_ptr + x, rf); - } - return x; -} - -void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), - int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, - float32x4_t, float32x4_t, const bool), - int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, - int32x4_t, int32x4_t, float32x4_t, float32x4_t, - float32x4_t, float32x4_t)) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); - - // Output quantization info (add 0.5 to round toward the nearest integer - 0.5 rounds away from zero) - const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset + 0.5f); - const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); - - if(is_broadcast_across_x) - { - // Select the broadcast input on the X axis - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
in2 : in1; - - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - - const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); - const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const uint8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo); - - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, - voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); - const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform(); - const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform(); - - // Input1 quantization info - const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset); - const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale); - - // Input2 quantization info - const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset); - const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale); - - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, - vscale1, vscale2, voffseto, invvscaleo); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo); - const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo); - *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); - } - }, - input1, input2, output); - } -} - -void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), - int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, - float32x4_t, float32x4_t, const bool), - int (*neon_func)(int, int, int, 
const int8_t *, const int8_t *, uint8_t *, - int32x4_t, int32x4_t, float32x4_t, float32x4_t, - float32x4_t, float32x4_t)) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); - - const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset); - const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); - - if(is_broadcast_across_x) - { - // Select the broadcast input on the X axis - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - - const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); - const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); - - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, - voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); - const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? 
afs : bfs, output_qinfo); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform(); - const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform(); - - // Input1 quantization info - const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset); - const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale); - - // Input2 quantization info - const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset); - const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale); - - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, - vscale1, vscale2, voffseto, invvscaleo); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); - const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); - *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); - } - }, - input1, input2, output); - } -} - -void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), - int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t, - float32x4_t, float32x4_t, const bool), - int (*neon_func)(int, int, int, const int8_t *, const int8_t *, int8_t *, - int32x4_t, int32x4_t, float32x4_t, float32x4_t, - float32x4_t, float32x4_t)) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); - - const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset); - const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); - - if(is_broadcast_across_x) - { - // Select the broadcast input on the X axis - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
in2 : in1; - - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - - const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); - const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); - - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, - voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); - const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform(); - const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform(); - - // Input1 quantization info - const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset); - const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale); - - // Input2 quantization info - const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset); - const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale); - - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, - vscale1, vscale2, voffseto, invvscaleo); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); - const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); - *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); - } - }, - input1, input2, output); - } -} - -template -void elementwise_arithm_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - elementwise_op_quantized(in1, in2, out, window, &elementwise_arithm_op_quantized_scalar, - &elementwise_arithm_op_quantized_broadcast_loop, - &elementwise_arithm_op_quantized_loop); -} -template -void 
elementwise_arithm_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - elementwise_op_quantized_signed(in1, in2, out, window, &elementwise_arithm_op_quantized_signed_scalar, - &elementwise_arithm_op_quantized_signed_broadcast_loop, - &elementwise_arithm_op_quantized_singed_loop); -} - -template -void elementwise_comp_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - elementwise_op_quantized(in1, in2, out, window, &elementwise_comp_op_quantized_scalar, - &elementwise_comp_op_quantized_broadcast_loop, - &elementwise_comp_op_quantized_loop); -} - -template -void elementwise_comp_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - elementwise_comp_quantized_signed(in1, in2, out, window, &elementwise_comp_op_quantized_scalar, - &elementwise_comp_op_quantized_signed_broadcast_loop, - &elementwise_comp_op_quantized_signed_loop); -} -} // namespace cpu -} // namespace arm_compute - -#endif /* SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */ diff --git a/src/core/cpu/kernels/elementwise/neon/elementwise_unary_list.h b/src/core/cpu/kernels/elementwise/neon/elementwise_unary_list.h deleted file mode 100644 index 307e95fae9..0000000000 --- a/src/core/cpu/kernels/elementwise/neon/elementwise_unary_list.h +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
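Putting the pieces of the deleted QASYMM8 kernels together, each output element is produced by dequantizing both inputs with their own (scale, offset), applying the operation in float, and requantizing with the output's parameters; the SIMD paths above do exactly this sixteen elements at a time. A self-contained scalar model for MAX (illustrative names, simple clamping in place of the library's quantize helpers):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    uint8_t quantized_max(uint8_t a, uint8_t b,
                          float scale1, int32_t offset1,
                          float scale2, int32_t offset2,
                          float scale_o, int32_t offset_o)
    {
        const float af = scale1 * (static_cast<int32_t>(a) - offset1);   // dequantize input 1
        const float bf = scale2 * (static_cast<int32_t>(b) - offset2);   // dequantize input 2
        const float rf = std::max(af, bf);                               // operate in float
        const int32_t q = static_cast<int32_t>(std::lround(rf / scale_o)) + offset_o;
        return static_cast<uint8_t>(std::min(255, std::max(0, q)));      // requantize and clamp
    }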
- */ -#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H -#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H - -#include "arm_compute/core/Types.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" - -namespace arm_compute -{ -namespace cpu -{ -template -inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarType &a) -{ - switch(op) - { - case ElementWiseUnary::RSQRT: - return 1 / sqrt(a); - case ElementWiseUnary::EXP: - return std::exp(a); - case ElementWiseUnary::NEG: - return -a; - case ElementWiseUnary::LOG: - return std::log(a); - case ElementWiseUnary::ABS: - return std::abs(a); - case ElementWiseUnary::ROUND: - return support::cpp11::nearbyint(a); - case ElementWiseUnary::SIN: - return std::sin(a); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } -} - -template -inline VectorType elementwise_op_imp(ElementWiseUnary op, const VectorType &a) -{ - switch(op) - { - case ElementWiseUnary::RSQRT: - return wrapper::vinvsqrt(a); - case ElementWiseUnary::EXP: - return wrapper::vexpq(a); - case ElementWiseUnary::NEG: - return wrapper::vneg(a); - case ElementWiseUnary::LOG: - return wrapper::vlog(a); - case ElementWiseUnary::ABS: - return wrapper::vabs(a); - case ElementWiseUnary::ROUND: - return wrapper::vround(a); - case ElementWiseUnary::SIN: - return wrapper::vsin(a); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } -} - -template -void elementwise_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) -{ - const int window_step_x = 16 / sizeof(ScalarType); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(in, win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input_ptr = reinterpret_cast(input.ptr()); - - int x = window_start_x; - for(; x <= window_end_x - window_step_x; x += window_step_x) - { - wrapper::vstore(output_ptr + x, elementwise_op_imp(op, wrapper::vloadq(input_ptr + x))); - } - for(; x < window_end_x; ++x) - { - *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x)); - } - }, - input, output); -} - -} // namespace cpu -} // namespace arm_compute - -#endif // SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H \ No newline at end of file diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise.cpp b/src/core/cpu/kernels/elementwise/sve/elementwise.cpp deleted file mode 100644 index 58ebb28fe5..0000000000 --- a/src/core/cpu/kernels/elementwise/sve/elementwise.cpp +++ /dev/null @@ -1,311 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
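The unary kernels above follow the same vector-loop-plus-scalar-tail structure as the binary ones, but select the operation at run time from the ElementWiseUnary enum inside the loop body. A portable scalar model of that dispatch (illustrative names, standard-library maths in place of the NEON wrappers):

    #include <cmath>
    #include <cstddef>

    enum class UnaryOp { RSQRT, EXP, NEG, LOG, ABS, ROUND, SIN };

    float unary_scalar(UnaryOp op, float a)
    {
        switch(op)
        {
            case UnaryOp::RSQRT: return 1.0f / std::sqrt(a);
            case UnaryOp::EXP:   return std::exp(a);
            case UnaryOp::NEG:   return -a;
            case UnaryOp::LOG:   return std::log(a);
            case UnaryOp::ABS:   return std::fabs(a);
            case UnaryOp::ROUND: return std::nearbyint(a);
            case UnaryOp::SIN:   return std::sin(a);
            default:             return a;
        }
    }

    void unary_apply(UnaryOp op, const float *in, float *out, std::size_t n)
    {
        for(std::size_t i = 0; i < n; ++i)
        {
            out[i] = unary_scalar(op, in[i]);   // the kernel does the same, one vector of lanes at a time
        }
    }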
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(__ARM_FEATURE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" -#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -using namespace arm_compute::wrapper; - -template -struct LoopArguments -{ - OperatorType op; - const InputScalarType *input1_ptr; - const InputScalarType *input2_ptr; - OutputScalarType *output_ptr; -}; - -template -struct BroadcastLoopArguments -{ - OperatorType op; - const InputScalarType *input1_ptr; - InputScalarType broadcast_value; - OutputScalarType *output_ptr; - bool reorder; -}; - -template -void arithmetic_op_loop(svbool_t pg, const LoopArguments &args) -{ - const auto in1 = svld1(pg, args.input1_ptr); - const auto in2 = svld1(pg, args.input2_ptr); - const auto res = elementwise_arithmetic_op::type>(pg, in1, in2, args.op); - svst1(pg, args.output_ptr, res); -} - -template -void arithmetic_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments &args) -{ - const auto non_broadcast_vector = svld1(pg, args.input1_ptr); - const auto broadcast_vector = svdup_n(args.broadcast_value); - const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector; - const auto in2 = args.reorder ? non_broadcast_vector : broadcast_vector; - const auto res = elementwise_arithmetic_op::type>(pg, in1, in2, args.op); - svst1(pg, args.output_ptr, res); -} - -template -void comparison_op_loop(svbool_t pg, const LoopArguments &args) -{ - const auto in1 = svld1(pg, args.input1_ptr); - const auto in2 = svld1(pg, args.input2_ptr); - const auto res = elementwise_comparison_op::type, typename sve_vector::type>(pg, in1, in2, args.op); - const svbool_t output_pg = narrow_to_byte_predicate(pg); - svst1(output_pg, args.output_ptr, res); -} - -template -void comparison_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments &args) -{ - const auto non_broadcast_vector = svld1(pg, args.input1_ptr); - const auto broadcast_vector = svdup_n(args.broadcast_value); - const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector; - const auto in2 = args.reorder ? 
non_broadcast_vector : broadcast_vector; - const auto res = elementwise_comparison_op::type, typename sve_vector::type>(pg, in1, in2, args.op); - const svbool_t output_pg = narrow_to_byte_predicate(pg); - svst1(output_pg, args.output_ptr, res); -} - -template -using LoopFuncType = void (*)(svbool_t, const LoopArguments &); - -template -using BroadcastLoopFuncType = void (*)(svbool_t, const BroadcastLoopArguments &); - -template ::type, - typename OutputScalarType = typename sve_scalar::type> -void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - OperatorType op, - LoopFuncType func, - BroadcastLoopFuncType broadcast_func) -{ - const auto all_true_pg = svptrue(); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - - int x = window_start_x; - - svbool_t pg = svwhilelt(x, window_end_x); - do - { - broadcast_func(pg, - { - op, - non_broadcast_input_ptr + x, - broadcast_value, - output_ptr + x, - !is_broadcast_input_2 - }); - x += svcnt(); - pg = svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - - int x = window_start_x; - - svbool_t pg = svwhilelt(x, window_end_x); - do - { - func(pg, - { - op, - input1_ptr + x, - input2_ptr + x, - output_ptr + x - }); - x += svcnt(); - pg = svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); - } -} - -template -void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const 
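Unlike the NEON kernels, the SVE driver above needs no scalar tail: svwhilelt builds a predicate covering only the remaining elements, the predicated loads and stores leave inactive lanes untouched, and the do/while keeps running while svptest_any reports at least one active lane. The same loop shape in isolation, for MAX on float32 (requires an SVE-enabled compiler and target):

    #include <arm_sve.h>
    #include <cstdint>

    void sve_max_f32(const float *a, const float *b, float *out, int64_t n)
    {
        int64_t x = 0;
        svbool_t pg = svwhilelt_b32_s64(x, n);           // lanes [x, n) are active
        do
        {
            const svfloat32_t va = svld1_f32(pg, a + x); // inactive lanes are not touched
            const svfloat32_t vb = svld1_f32(pg, b + x);
            svst1_f32(pg, out + x, svmax_f32_z(pg, va, vb));
            x += static_cast<int64_t>(svcntw());         // number of 32-bit lanes per vector
            pg = svwhilelt_b32_s64(x, n);
        }
        while(svptest_any(svptrue_b32(), pg));
    }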
Window &window) -{ - using VectorType = typename sve_vector::type; - - elementwise_op(in1, in2, out, window, op, - &arithmetic_op_loop, - &arithmetic_op_broadcast_loop); -} - -template -void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); - using InputVectorType = typename sve_vector::type; - using OutputVectorType = typename sve_vector::type; - - elementwise_op(in1, in2, out, window, op, - &comparison_op_loop, - &comparison_op_broadcast_loop); -} - -template <> -svint32_t elementwise_pow(svbool_t &pg, const svint32_t &a, const svint32_t &b) -{ - return svcvt_s32_z(pg, svpow_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b))); -} - -template <> -svint32_t elementwise_div(svbool_t &pg, const svint32_t &a, const svint32_t &b) -{ - return svcvt_s32_z(pg, svdiv_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b))); -} - -template <> -svint16_t elementwise_div(svbool_t &pg, const svint16_t &a, const svint16_t &b) -{ - ARM_COMPUTE_UNUSED(pg, a, b); - ARM_COMPUTE_ERROR("Not supported"); -} - -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const 
Window &window); - -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const 
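The long block of explicit instantiations above and below exists because these SVE kernels are function templates defined in a .cpp file rather than a header: every operation/element-type combination used elsewhere in the library must be instantiated here, or the linker would find no definition. The mechanism in miniature (illustrative names):

    // definition lives in a .cpp ...
    template <typename T>
    T twice(T v)
    {
        return v + v;
    }

    // ... so this translation unit must emit the specialisations other files link against
    template int   twice<int>(int);
    template float twice<float>(float);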
Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -} // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE) */ \ No newline at end of file diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_list.h deleted file mode 100644 index fea38d2995..0000000000 --- a/src/core/cpu/kernels/elementwise/sve/elementwise_list.h +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
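The instantiations above are driven by a predicated SVE loop: an svwhilelt predicate covers the elements still to be processed, the vector op runs under that predicate, and the loop ends once svptest_any reports no active lanes. A minimal stand-alone sketch of that pattern, assuming an SVE-enabled toolchain, plain float32 pointers and a fixed addition op instead of the library's Window/Iterator plumbing and op dispatch:

#include <arm_sve.h>
#include <cstdint>

void add_f32_sve(const float *a, const float *b, float *out, int64_t len)
{
    int64_t  x  = 0;
    svbool_t pg = svwhilelt_b32(x, len); // predicate covering the lanes still to process
    do
    {
        const svfloat32_t va = svld1_f32(pg, a + x);
        const svfloat32_t vb = svld1_f32(pg, b + x);
        svst1_f32(pg, out + x, svadd_f32_z(pg, va, vb)); // inactive lanes are not stored
        x += svcntw();                                   // number of 32-bit lanes per SVE vector
        pg = svwhilelt_b32(x, len);
    } while(svptest_any(svptrue_b32(), pg));
}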
- */ -#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H -#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H -#if defined(ARM_COMPUTE_ENABLE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/SVEMath.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "src/core/NEON/wrapper/svtraits.h" -#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -using namespace arm_compute::wrapper; - -template -VectorType elementwise_pow(svbool_t &pg, const VectorType &a, const VectorType &b) -{ - return svpow_z(pg, a, b); -} - -template -VectorType elementwise_div(svbool_t &pg, const VectorType &a, const VectorType &b) -{ - return svdiv_z(pg, a, b); -} - -template -svbool_t narrow_to_byte_predicate(svbool_t pg) -{ - const auto all_false = svpfalse(); - - switch(bytewidth) - { - case 8: - pg = svuzp1_b32(pg, all_false); - /* fall through */ - case 4: - pg = svuzp1_b16(pg, all_false); - /* fall through */ - case 2: - pg = svuzp1_b8(pg, all_false); - /* fall through */ - default: - break; - } - return pg; -} - -template -VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const VectorType &b, ArithmeticOperation op) -{ - using ScalarType = typename wrapper::sve_scalar::type; - VectorType res{}; - - switch(op) - { - case ArithmeticOperation::MAX: - res = svmax_z(pg, a, b); - break; - case ArithmeticOperation::MIN: - res = svmin_z(pg, a, b); - break; - case ArithmeticOperation::SQUARED_DIFF: - { - const auto tmp = svsub_z(pg, a, b); - res = svmul_z(pg, tmp, tmp); - break; - } - case ArithmeticOperation::PRELU: - { - const auto zero = svdup_n(ScalarType(0)); - const auto tmp = svmul_z(pg, a, b); - const auto gt = svcmpgt(pg, a, zero); - res = svsel(gt, a, tmp); - break; - } - case ArithmeticOperation::DIV: - { - res = elementwise_div(pg, a, b); - break; - } - case ArithmeticOperation::POWER: - { - res = elementwise_pow(pg, a, b); - break; - } - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - - return res; -} - -template -OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op) -{ - svbool_t selection_vector{}; - - switch(op) - { - case ComparisonOperation::Equal: - selection_vector = svcmpeq(pg, a, b); - break; - case ComparisonOperation::NotEqual: - selection_vector = svcmpne(pg, a, b); - break; - case ComparisonOperation::Greater: - selection_vector = svcmpgt(pg, a, b); - break; - case ComparisonOperation::GreaterEqual: - selection_vector = svcmpge(pg, a, b); - break; - case ComparisonOperation::Less: - selection_vector = svcmplt(pg, a, b); - break; - case ComparisonOperation::LessEqual: - selection_vector = svcmple(pg, a, b); - break; - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - - using InputScalarType = typename wrapper::sve_scalar::type; - selection_vector = narrow_to_byte_predicate(selection_vector); - - using OutputScalarType = typename wrapper::sve_scalar::type; - const auto false_vector = svdup_n(static_cast((uint32_t)0)); - const auto true_vector = svdup_n(static_cast(~(uint32_t)0)); - auto ret = svsel(selection_vector, true_vector, false_vector); - - return ret; -} - -template -void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); - -template -void elementwise_comparison_op(const 
ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -} // namespace cpu -} // namespace arm_compute -#endif // defined(ARM_COMPUTE_ENABLE_SVE) -#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H */ diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h deleted file mode 100644 index 5e04128b44..0000000000 --- a/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h +++ /dev/null @@ -1,366 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H -#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H - -#if defined(ARM_COMPUTE_ENABLE_SVE2) - -#include "src/core/NEON/wrapper/svtraits.h" -#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h" - -namespace arm_compute -{ -namespace cpu -{ -using namespace arm_compute::wrapper; - -template -struct QuantizedLoopArguments -{ - OperatorType op; - const InputScalarType *input1_ptr; - const InputScalarType *input2_ptr; - OutputScalarType *output_ptr; - - const svint32_t &in1_offset; - const svint32_t &in2_offset; - const svint32_t &out_offset; - const svfloat32_t &in1_scale; - const svfloat32_t &in2_scale; - const svfloat32_t &out_scale; -}; - -template -struct BroadcastQuantizedLoopArguments -{ - OperatorType op; - const InputScalarType *input1_ptr; - float broadcast_value; - OutputScalarType *output_ptr; - bool reorder; - - const svint32_t &in1_offset; - const svint32_t &out_offset; - const svfloat32_t &in1_scale; - const svfloat32_t &out_scale; -}; - -svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale) -{ - auto x = svld1(pg, ptr); - - const auto widened = svcreate4( - svmovlb(svmovlb(x)), - svmovlt(svmovlb(x)), - svmovlb(svmovlt(x)), - svmovlt(svmovlt(x))); - - pg = svptrue_b8(); - - return svcreate4( - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 0), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 1), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 2), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale)); -} - -svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale) -{ - auto x = svld1(pg, ptr); - - //vprint(x); - - const 
auto widened = svcreate4( - svmovlb(svmovlb(x)), - svmovlt(svmovlb(x)), - svmovlb(svmovlt(x)), - svmovlt(svmovlt(x))); - - pg = svptrue_b8(); - - return svcreate4( - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 0)), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 1)), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 2)), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale)); -} - -void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) -{ - const auto quantized = svcreate4( - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset)); - - const auto narrowed_bottom = svqxtunt(svqxtunb(svget4(quantized, 0)), svget4(quantized, 1)); - const auto narrowed_top = svqxtunt(svqxtunb(svget4(quantized, 2)), svget4(quantized, 3)); - const auto narrowed = svqxtnt(svqxtnb(narrowed_bottom), narrowed_top); - svst1(pg, ptr, narrowed); -} - -void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) -{ - const auto quantized = svcreate4( - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset)); - - const auto narrowed_bottom = svqxtnt(svqxtnb(svget4(quantized, 0)), svget4(quantized, 1)); - const auto narrowed_top = svqxtnt(svqxtnb(svget4(quantized, 2)), svget4(quantized, 3)); - const auto narrowed = svqxtnt(svqxtnb(narrowed_bottom), narrowed_top); - - svst1(pg, ptr, narrowed); -} - -template -inline void arithmetic_op_quantized_loop(svbool_t pg, const QuantizedLoopArguments &args) -{ - const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); - const auto in2 = load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale); - - const auto result = svcreate4( - elementwise_arithmetic_op(pg, svget4(in1, 0), svget4(in2, 0), args.op), - elementwise_arithmetic_op(pg, svget4(in1, 1), svget4(in2, 1), args.op), - elementwise_arithmetic_op(pg, svget4(in1, 2), svget4(in2, 2), args.op), - elementwise_arithmetic_op(pg, svget4(in1, 3), svget4(in2, 3), args.op)); - - store_quantized(args.output_ptr, pg, result, args.out_offset, args.out_scale); -} - -template -inline void arithmetic_op_broadcast_quantized_loop(svbool_t pg, const BroadcastQuantizedLoopArguments &args) -{ - const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); - const auto in2 = svcreate4( - svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value)); - - const auto &af = args.reorder ? in2 : in1; - const auto &bf = args.reorder ? 
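Behind the widen/convert/narrow intrinsics, load_quantized and store_quantized implement the usual asymmetric-quantization round trip: dequantize each input with (q - offset) * scale, apply the operation in float, then requantize with the output's inverse scale and offset. A scalar model of that arithmetic (not the library's code; std::lround stands in for the round-to-nearest svrinta, and each input carries its own quantization parameters):

#include <algorithm>
#include <cmath>
#include <cstdint>

uint8_t quantized_binary_op(uint8_t qa, uint8_t qb,
                            int32_t a_offset, float a_scale,
                            int32_t b_offset, float b_scale,
                            int32_t out_offset, float out_inv_scale,
                            float (*op)(float, float))
{
    const float   a = (static_cast<int32_t>(qa) - a_offset) * a_scale; // dequantize input 1
    const float   b = (static_cast<int32_t>(qb) - b_offset) * b_scale; // dequantize input 2
    const int32_t q = static_cast<int32_t>(std::lround(op(a, b) * out_inv_scale)) + out_offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));        // saturate to QASYMM8
}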
in1 : in2; - - const auto result = svcreate4( - elementwise_arithmetic_op(pg, svget4(af, 0), svget4(bf, 0), args.op), - elementwise_arithmetic_op(pg, svget4(af, 1), svget4(bf, 1), args.op), - elementwise_arithmetic_op(pg, svget4(af, 2), svget4(bf, 2), args.op), - elementwise_arithmetic_op(pg, svget4(af, 3), svget4(bf, 3), args.op)); - - store_quantized(args.output_ptr, pg, result, args.out_offset, args.out_scale); -} - -template -inline void comparison_op_quantized_loop(svbool_t pg, const QuantizedLoopArguments &args) -{ - const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); - const auto in2 = load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale); - - using OutputVectorType = typename wrapper::traits::sve_vector::type; - - const auto result = svcreate4( - elementwise_comparison_op(pg, svget4(in1, 0), svget4(in2, 0), args.op), - elementwise_comparison_op(pg, svget4(in1, 1), svget4(in2, 1), args.op), - elementwise_comparison_op(pg, svget4(in1, 2), svget4(in2, 2), args.op), - elementwise_comparison_op(pg, svget4(in1, 3), svget4(in2, 3), args.op)); - - const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); - const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); - const auto zipped = svzip1(zipped_bottom, zipped_top); - svst1(pg, args.output_ptr, zipped); -} - -template -inline void comparison_op_broadcast_quantized_loop(svbool_t pg, const BroadcastQuantizedLoopArguments &args) -{ - const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); - const auto in2 = svcreate4( - svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value)); - - const auto &af = args.reorder ? in2 : in1; - const auto &bf = args.reorder ? 
in1 : in2; - - using OutputVectorType = typename wrapper::traits::sve_vector::type; - - const auto result = svcreate4( - elementwise_comparison_op(pg, svget4(af, 0), svget4(bf, 0), args.op), - elementwise_comparison_op(pg, svget4(af, 1), svget4(bf, 1), args.op), - elementwise_comparison_op(pg, svget4(af, 2), svget4(bf, 2), args.op), - elementwise_comparison_op(pg, svget4(af, 3), svget4(bf, 3), args.op)); - - const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); - const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); - const auto zipped = svzip1(zipped_bottom, zipped_top); - svst1(pg, args.output_ptr, zipped); -} - -template -using LoopQuantizedFuncType = void (*)(svbool_t, const QuantizedLoopArguments &); - -template -using BroadcastQuantizedLoopFuncType = void (*)(svbool_t, const BroadcastQuantizedLoopArguments &); - -template ::type, - typename OutputScalarType = typename wrapper::sve_scalar::type> -void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - OperatorType op, - LoopQuantizedFuncType func, - BroadcastQuantizedLoopFuncType broadcast_func) -{ - const auto all_true_pg = wrapper::svptrue(); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - const auto output_voffset = svdup_n(out->info()->quantization_info().uniform().offset); - const auto output_vscale = svdup_n(1.f / out->info()->quantization_info().uniform().scale); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - - const auto non_broadcast_qinfo = is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info(); - const auto broadcast_qinfo = is_broadcast_input_2 ? 
in2->info()->quantization_info() : in1->info()->quantization_info(); - - const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset); - const auto non_broadcast_vscale = svdup_n(non_broadcast_qinfo.uniform().scale); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - - int x = window_start_x; - - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto args = BroadcastQuantizedLoopArguments - { - op, - non_broadcast_input_ptr + x, - Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo), - output_ptr + x, - !is_broadcast_input_2, - non_broadcast_voffset, output_voffset, - non_broadcast_vscale, output_vscale - }; - broadcast_func(pg, args); - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - const auto in1_voffset = svdup_n(in1->info()->quantization_info().uniform().offset); - const auto in1_vscale = svdup_n(in1->info()->quantization_info().uniform().scale); - - const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset); - const auto in2_vscale = svdup_n(in2->info()->quantization_info().uniform().scale); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - - int x = window_start_x; - - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto args = QuantizedLoopArguments - { - op, - input1_ptr + x, - input2_ptr + x, - output_ptr + x, - in1_voffset, in2_voffset, output_voffset, - in1_vscale, in2_vscale, output_vscale - }; - func(pg, args); - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); - } -} - -template -void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - using VectorType = typename wrapper::traits::sve_vector::type; - elementwise_quantized_op(in1, in2, out, window, op, - &arithmetic_op_quantized_loop, - &arithmetic_op_broadcast_quantized_loop); -} - -template -void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); - using InputVectorType = typename wrapper::traits::sve_vector::type; - using OutputVectorType = typename wrapper::traits::sve_vector::type; - elementwise_quantized_op(in1, in2, out, window, op, - 
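When one input is broadcast along X, the kernel dequantizes the single broadcast value once, splats it, and reuses the same per-vector op; the reorder flag then restores the original operand order, which matters for non-commutative operations such as PRELU, DIV and POWER. A plain scalar sketch of that operand handling (hypothetical helper name, not the library's API):

#include <functional>

float apply_with_broadcast(float non_broadcast_value, float broadcast_value, bool reorder,
                           const std::function<float(float, float)> &op)
{
    // reorder == true means the broadcast tensor was the first operand of the
    // original call, so it has to go back on the left-hand side.
    const float lhs = reorder ? broadcast_value : non_broadcast_value;
    const float rhs = reorder ? non_broadcast_value : broadcast_value;
    return op(lhs, rhs);
}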
&comparison_op_quantized_loop, - &comparison_op_broadcast_quantized_loop); -} -} // namespace cpu -} // namespace arm_compute - -#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ -#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */ \ No newline at end of file diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp b/src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp deleted file mode 100644 index ddf1febd66..0000000000 --- a/src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(__ARM_FEATURE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/SVEMath.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -template -inline typename std::enable_if::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) -{ - switch(op) - { - case ElementWiseUnary::RSQRT: - return svinvsqrt(pg, a); - case ElementWiseUnary::EXP: - return wrapper::svexp_z(pg, a); - case ElementWiseUnary::NEG: - return svneg_z(pg, a); - case ElementWiseUnary::LOG: - return wrapper::svlog_z(pg, a); - case ElementWiseUnary::ABS: - return svabs_z(pg, a); - case ElementWiseUnary::ROUND: - return svrintn_z(pg, a); - case ElementWiseUnary::SIN: - return wrapper::svsin_z(pg, a); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED"); - } -} - -template -inline typename std::enable_if::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) -{ - switch(op) - { - case ElementWiseUnary::NEG: - return svneg_z(pg, a); - case ElementWiseUnary::ABS: - return svabs_z(pg, a); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED"); - } -} - -template -void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) -{ - const auto all_true_pg = wrapper::svptrue(); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(in, win); - Iterator output(out, win); - - execute_window_loop(win, [&](const 
Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input_ptr = reinterpret_cast(input.ptr()); - int x = window_start_x; - - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto vin = svld1(pg, input_ptr + x); - svst1(pg, output_ptr + x, elementwise_op_sve_imp(pg, op, vin)); - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input, output); -} - -template void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); -template void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); -template void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); -} // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE) */ \ No newline at end of file diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h deleted file mode 100644 index c2b495f27c..0000000000 --- a/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H -#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H - -#include "arm_compute/core/Types.h" -#if defined(ARM_COMPUTE_ENABLE_SVE) - -namespace arm_compute -{ -namespace cpu -{ -template -void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); -} // namespace cpu -} // namespace arm_compute -#endif // defined(ARM_COMPUTE_ENABLE_SVE) -#endif // SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H \ No newline at end of file diff --git a/src/core/cpu/kernels/floor/list.h b/src/core/cpu/kernels/floor/list.h deleted file mode 100644 index 4367e0ffc9..0000000000 --- a/src/core/cpu/kernels/floor/list.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. 
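The unary kernel above uses std::enable_if to give floating-point element types the full operation set (RSQRT, EXP, LOG, ROUND, SIN, ...) while integer types only get NEG and ABS. A small stand-alone sketch of the same SFINAE dispatch, using scalar math and hypothetical names:

#include <cmath>
#include <cstdlib>
#include <type_traits>

enum class UnaryOp { ABS, NEG, EXP };

// Floating-point element types support the full op set...
template <typename T>
typename std::enable_if<std::is_floating_point<T>::value, T>::type apply(UnaryOp op, T a)
{
    switch(op)
    {
        case UnaryOp::ABS: return std::fabs(a);
        case UnaryOp::NEG: return -a;
        case UnaryOp::EXP: return std::exp(a);
    }
    return a;
}

// ...while integer element types only support ABS and NEG.
template <typename T>
typename std::enable_if<std::is_integral<T>::value, T>::type apply(UnaryOp op, T a)
{
    switch(op)
    {
        case UnaryOp::ABS: return a < 0 ? -a : a;
        case UnaryOp::NEG: return -a;
        default: std::abort(); // mirrors ARM_COMPUTE_ERROR("NOT_SUPPORTED")
    }
}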
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_NEON_KERNELS_FLOOR_LIST_H -#define SRC_CORE_NEON_KERNELS_FLOOR_LIST_H - -namespace arm_compute -{ -namespace cpu -{ -#define DECLARE_FLOOR_KERNEL(func_name) \ - void func_name(const void *src, void *dst, int len) - -DECLARE_FLOOR_KERNEL(fp16_neon_floor); -DECLARE_FLOOR_KERNEL(fp32_neon_floor); - -#undef DECLARE_FLOOR_KERNEL -} // namespace cpu -} // namespace arm_compute - -#endif /* SRC_CORE_NEON_KERNELS_FLOOR_LIST_H */ diff --git a/src/core/cpu/kernels/floor/neon/fp16.cpp b/src/core/cpu/kernels/floor/neon/fp16.cpp deleted file mode 100644 index f362676a36..0000000000 --- a/src/core/cpu/kernels/floor/neon/fp16.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) - -#include "src/common/utils/Validate.h" -#include "src/core/NEON/NEMath.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -constexpr int step = 8; - -void fp16_neon_floor(const void *src, void *dst, int len) -{ - ARM_COMPUTE_ASSERT_NOT_NULLPTR(src); - ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); - ARM_COMPUTE_ASSERT(len >= 0); - - auto psrc = static_cast(src); - auto pdst = static_cast<__fp16 *>(dst); - - for(; len >= step; len -= step) - { - vst1q_f16(pdst, vfloorq_f16(vld1q_f16(psrc))); - psrc += step; - pdst += step; - } - - for(; len > 0; --len) - { - *pdst = std::floor(*psrc); - ++psrc; - ++pdst; - } -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/core/cpu/kernels/floor/neon/fp32.cpp b/src/core/cpu/kernels/floor/neon/fp32.cpp deleted file mode 100644 index f5efb2e849..0000000000 --- a/src/core/cpu/kernels/floor/neon/fp32.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/common/utils/Validate.h" -#include "src/core/NEON/NEMath.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -constexpr int step = 4; - -void fp32_neon_floor(const void *src, void *dst, int len) -{ - ARM_COMPUTE_ASSERT_NOT_NULLPTR(src); - ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); - ARM_COMPUTE_ASSERT(len >= 0); - - auto psrc = static_cast(src); - auto pdst = static_cast(dst); - - for(; len >= step; len -= step) - { - vst1q_f32(pdst, vfloorq_f32(vld1q_f32(psrc))); - psrc += step; - pdst += step; - } - - for(; len > 0; --len) - { - *pdst = std::floor(*psrc); - ++pdst; - ++psrc; - } -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp b/src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp deleted file mode 100644 index f5c63b763f..0000000000 --- a/src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp +++ /dev/null @@ -1,359 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
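Both floor kernels follow the same shape: a NEON main loop that processes one full vector per iteration, then a scalar tail for the leftover elements. A stand-alone fp32 analogue of that pattern, using the ARMv8 vrndmq_f32 round-toward-minus-infinity intrinsic in place of the library's vfloorq_f32 helper:

#include <arm_neon.h>
#include <cmath>

void floor_f32(const float *src, float *dst, int len)
{
    constexpr int step = 4; // four fp32 lanes per 128-bit NEON register
    for(; len >= step; len -= step, src += step, dst += step)
    {
        vst1q_f32(dst, vrndmq_f32(vld1q_f32(src)));
    }
    for(; len > 0; --len)
    {
        *dst++ = std::floor(*src++); // scalar tail for the remaining elements
    }
}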
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h" - -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/CPP/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/core/utils/AssemblyUtils.h" - -#include "src/core/NEON/kernels/assembly/depthwise.hpp" - -#include "depthwise_common.hpp" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -using namespace arm_compute::misc::shape_calculator; - -namespace -{ -constexpr unsigned int idx_width = 1; -constexpr unsigned int idx_height = 2; -constexpr unsigned int idx_channels = 0; -constexpr unsigned int idx_batches = 3; - -template -void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst, - const ConvolutionInfo &info, const CPUInfo &cpu_info, - std::unique_ptr &kernel) -{ - unsigned int stride_cols{}; - unsigned int stride_rows{}; - std::tie(stride_cols, stride_rows) = info.pad_stride_info.stride(); - - const arm_conv::PaddingValues padding = assembly_utils::map_to_arm_conv_padding(info.pad_stride_info); - - const unsigned int n_batches = src->dimension(idx_batches); - const unsigned int src_rows = src->dimension(idx_height); - const unsigned int src_cols = src->dimension(idx_width); - const unsigned int n_channels = src->dimension(idx_channels); - const unsigned int dst_rows = dst->dimension(idx_height); - const unsigned int dst_cols = dst->dimension(idx_width); - - const unsigned int kernel_cols = weights->dimension(idx_width); - const unsigned int kernel_rows = weights->dimension(idx_height); - - const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info); - - arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, - n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier, - padding, activation, nullptr); - - // Configure assembly pooling kernel - auto dwc_kernel_asm = arm_conv::depthwise::depthwise(args); - if(dwc_kernel_asm == nullptr) - { - // Configuration not supported: Leave function unconfigured: - return; - } - - kernel = std::move(dwc_kernel_asm); -} - -template -void 
create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst, - const ConvolutionInfo &info, const CPUInfo &cpu_info, - std::unique_ptr &kernel, - std::vector &multipliers, std::vector &right_shifts, std::vector &left_shifts) -{ - unsigned int stride_cols{}; - unsigned int stride_rows{}; - std::tie(stride_cols, stride_rows) = info.pad_stride_info.stride(); - - const arm_conv::PaddingValues padding = assembly_utils::map_to_arm_conv_padding(info.pad_stride_info); - - const unsigned int n_batches = src->dimension(idx_batches); - const unsigned int src_rows = src->dimension(idx_height); - const unsigned int src_cols = src->dimension(idx_width); - const unsigned int n_channels = src->dimension(idx_channels); - const unsigned int dst_rows = dst->dimension(idx_height); - const unsigned int dst_cols = dst->dimension(idx_width); - - const unsigned int kernel_cols = weights->dimension(idx_width); - const unsigned int kernel_rows = weights->dimension(idx_height); - - const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info); - - arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, - n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier, - padding, activation, nullptr); - - const auto src_qinfo = src->quantization_info().uniform(); - const auto weights_qinfo = weights->quantization_info(); - const auto dst_qinfo = dst->quantization_info().uniform(); - - const unsigned int num_filters = weights_qinfo.scale().size(); - - multipliers.resize(num_filters); - std::vector dst_shifts(num_filters); - quantization::compute_quantized_multipliers_and_shifts(src, - weights, - dst, - multipliers.data(), - dst_shifts.data()); - - // Quantize activation bounds - int32_t min_activation = std::numeric_limits::lowest(); - int32_t max_activation = std::numeric_limits::max(); - if(info.act_info.enabled()) - { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(info.act_info, src->data_type(), dst_qinfo); - } - - // Set quantization parameters for assembly kernels - arm_gemm::Requantize32 requant_args{}; - if(is_data_type_quantized_per_channel(weights->data_type())) - { - left_shifts.resize(num_filters); - right_shifts.resize(num_filters); - bool need_left_shift = false; // Select more optimized path if left shift is not needed - for(unsigned int i = 0; i < num_filters; ++i) - { - left_shifts[i] = std::max(-dst_shifts[i], static_cast(0)); - right_shifts[i] = std::min(-dst_shifts[i], static_cast(0)); - if(dst_shifts[i] < 0 && !need_left_shift) - { - need_left_shift = true; - } - } - - requant_args = arm_gemm::Requantize32(nullptr, - 0, - src_qinfo.offset, - weights_qinfo.uniform().offset, - dst_qinfo.offset, - (need_left_shift) ? 
left_shifts.data() : nullptr, - right_shifts.data(), - multipliers.data(), - static_cast(min_activation), - static_cast(max_activation)); - } - else - { - requant_args = arm_gemm::Requantize32(nullptr, - 0, - src_qinfo.offset, - weights_qinfo.uniform().offset, - dst_qinfo.offset, - -dst_shifts[0], - multipliers[0], - static_cast(min_activation), - static_cast(max_activation)); - } - - // Configure assembly pooling kernel with requantization - auto dwc_kernel_asm = arm_conv::depthwise::depthwise(args, requant_args); - if(dwc_kernel_asm == nullptr) - { - // Configuration not supported: Leave function unconfigured: - return; - } - - kernel = std::move(dwc_kernel_asm); -} -} // namespace - -CpuDepthwiseConv2dAssemblyWrapperKernel::CpuDepthwiseConv2dAssemblyWrapperKernel() - : _kernel_asm(nullptr), - _multipliers(), - _left_shifts(), - _right_shifts() -{ -} - -CpuDepthwiseConv2dAssemblyWrapperKernel::~CpuDepthwiseConv2dAssemblyWrapperKernel() = default; - -void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *, ITensorInfo *dst, - const ConvolutionInfo &info, const CPUInfo &cpu_info) -{ - ARM_COMPUTE_UNUSED(cpu_info); - ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - - // Destination initialization if not yet initialized - const TensorShape dst_shape = compute_depthwise_convolution_shape(*src, *weights, info); - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); - -#if defined(__aarch64__) - switch(src->data_type()) - { - case DataType::QASYMM8: - if(is_data_type_quantized_per_channel(weights->data_type())) - { - create_arm_dwc_quant(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts); - } - else - { - create_arm_dwc_quant(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts); - } - break; - case DataType::QASYMM8_SIGNED: - create_arm_dwc_quant(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts); - break; -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - case DataType::F16: - create_arm_dwc(src, weights, dst, info, cpu_info, _kernel_asm); - break; -#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - case DataType::F32: - create_arm_dwc(src, weights, dst, info, cpu_info, _kernel_asm); - break; - default: - break; - } -#endif // defined(__aarch64__) - - Window win = calculate_max_window(*dst, Steps()); - ICpuKernel::configure(win); -} - -Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - -#if !defined(__aarch64__) - ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels"); -#endif // !defined(__aarch64__) - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Only NHWC is supported by assembly kernels"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.dilation != Size2D(1, 1), "Assembly kernels do not support dilation != (1, 1)"); - - if(is_data_type_quantized_per_channel(weights->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != 
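For per-channel quantized weights, the kernel converts each signed dst_shift into the pair of non-negative left / non-positive right shifts that arm_gemm::Requantize32 expects, and records whether any channel actually needs a left shift so the cheaper path can be selected otherwise. The same preparation in isolation:

#include <algorithm>
#include <cstdint>
#include <vector>

struct SplitShifts
{
    std::vector<int32_t> left, right;
    bool                 need_left_shift = false;
};

SplitShifts split_shifts(const std::vector<int32_t> &dst_shifts)
{
    SplitShifts out;
    out.left.reserve(dst_shifts.size());
    out.right.reserve(dst_shifts.size());
    for(int32_t s : dst_shifts)
    {
        out.left.push_back(std::max(-s, 0));   // negative dst_shift becomes a left shift
        out.right.push_back(std::min(-s, 0));  // positive dst_shift becomes a right shift
        out.need_left_shift = out.need_left_shift || (s < 0);
    }
    return out;
}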
weights->quantization_info().scale().size()); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); - } - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(0)); - - if(is_data_type_quantized(src->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); - } - } - - if(dst->total_size() > 0) - { - const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - return Status{}; -} - -void CpuDepthwiseConv2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get()); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_UNUSED(window); - ARM_COMPUTE_UNUSED(info); - - ARM_COMPUTE_ERROR_ON(tensors.empty()); - - const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0); - ITensor *storage = tensors.get_tensor(TensorType::ACL_INT_1); - - const auto src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); - auto dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes(); - auto working_space = workspace->buffer() + workspace->info()->offset_first_element_in_bytes(); - auto parameters_ptr = storage->buffer() + storage->info()->offset_first_element_in_bytes(); - - const auto src_shape = src->info()->tensor_shape(); - const auto dst_shape = dst->info()->tensor_shape(); - const auto src_padding = src->info()->padding(); - const auto dst_padding = dst->info()->padding(); - - const size_t ld_src_col = src_shape[0] + src_padding.left + src_padding.right; - const size_t ld_src_row = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom); - const size_t ld_src_batch = ld_src_row * src_shape[2]; - const size_t ld_dst_col = dst_shape[0] + dst_padding.left + dst_padding.right; - const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom); - const size_t ld_dst_batch = ld_dst_row * dst_shape[2]; - - _kernel_asm->execute(src_ptr, ld_src_col, ld_src_row, ld_src_batch, - parameters_ptr, - dst_ptr, ld_dst_col, ld_dst_row, ld_dst_batch, - working_space, info.thread_id, info.num_threads); -} - -void CpuDepthwiseConv2dAssemblyWrapperKernel::pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weight_row) -{ - _kernel_asm->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weight_row); -} - -size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_storage_size() const -{ - return _kernel_asm->get_storage_size(); -} - -size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_working_size(unsigned int num_threads, unsigned int num_input_channels) const -{ - return _kernel_asm->get_working_size(num_threads, num_input_channels); -} - -bool CpuDepthwiseConv2dAssemblyWrapperKernel::is_configured() const -{ - return _kernel_asm != nullptr; -} - -const char *CpuDepthwiseConv2dAssemblyWrapperKernel::name() const -{ - return "CpuDepthwiseConv2dAssemblyWrapperKernel"; -} -} // namespace kernels -} // namespace 
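run_op above derives the leading dimensions handed to the assembly kernel directly from the tensor shape plus its element padding: dimension 0 is the innermost one, and padding on a dimension widens every stride that contains it. The same computation in isolation:

#include <cstddef>

struct LeadingDims
{
    size_t col, row, batch;
};

LeadingDims compute_leading_dims(const size_t shape[3],             // dims 0..2 of the tensor shape
                                 size_t pad_left, size_t pad_right, // element padding on dimension 0
                                 size_t pad_top, size_t pad_bottom) // element padding on dimension 1
{
    LeadingDims ld{};
    ld.col   = shape[0] + pad_left + pad_right;
    ld.row   = ld.col * (shape[1] + pad_top + pad_bottom);
    ld.batch = ld.row * shape[2];
    return ld;
}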
cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h b/src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h deleted file mode 100644 index 8ff44441e9..0000000000 --- a/src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H -#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H - -#include "arm_compute/core/Types.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_conv -{ -namespace depthwise -{ -// Forward declarations -class IDepthwiseCommon; -} // depthwise -} // arm_conv - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** This class is a wrapper for the depthwise convolution assembly kernels. */ -class CpuDepthwiseConv2dAssemblyWrapperKernel final : public ICpuKernel -{ -public: - /** Default constructor */ - CpuDepthwiseConv2dAssemblyWrapperKernel(); - ~CpuDepthwiseConv2dAssemblyWrapperKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dAssemblyWrapperKernel); - - /** Initialise the kernel's src and dst. - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. - * Data type supported: same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED. - * @param[in] bias Bias tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: same as @p src, S32 when @p src is QASYMM8/QASYMM8_SIGNED. - * @param[out] dst Destination tensor info. Data type supported: same as @p input. - * @param[in] info Depthwise convolution layer meta-data. - * @param[in] cpu_info CPU information needed to select the most appropriate kernel. - */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info, const CPUInfo &cpu_info); - - /** Indicates whether or not this function can be used to process the given parameters. - * - * Similar to @ref CpuDepthwiseConv2dAssemblyWrapperKernel::configure() - * - * @return a status. 
- */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - - /** Pack bias and weights in a storage space for the assembly kernel - * - * @param[in] parameters_ptr Pointer to storage space. - * @param[in] bias_ptr Pointer to bias buffer. - * @param[in] weights_ptr Pointer to weights buffer. - * @param[in] ld_weights_col Columns displacement for the weights tensor. - * @param[in] ld_weights_row Rows displacement for the weights tensor. - */ - void pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weights_row); - - /** Get the amount of storage space required for the rearranged weights and bias. - * - * @return size of workspace - */ - size_t get_storage_size() const; - - /** Get size of the workspace needed by the assembly kernel. - * - * @param[in] num_threads Maximum number of threads that are going to be spawned. - * @param[in] num_input_channels Number of channels of the input tensor. - * - * @return size of workspace - */ - size_t get_working_size(unsigned int num_threads, unsigned int num_input_channels) const; - - /** Was the asm kernel successfully configured? - * - * @return True if the asm kernel is configured and ready to run - */ - bool is_configured() const; - -private: - std::unique_ptr _kernel_asm; - std::vector _multipliers{}; - std::vector _left_shifts{}; - std::vector _right_shifts{}; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H */ diff --git a/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp deleted file mode 100644 index 89dd27a20a..0000000000 --- a/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp +++ /dev/null @@ -1,279 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/INEKernel.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -using namespace arm_compute::misc::shape_calculator; - -void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info) -{ - ARM_COMPUTE_UNUSED(cpu_info); - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // dst initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, info))); - -#if defined(__aarch64__) - const bool requantize = src->quantization_info() != dst->quantization_info(); - - switch(src->data_type()) - { - case DataType::QASYMM8: - if(requantize) - { - create_arm_pooling_requant(src, dst, info, cpu_info); - } - else - { - create_arm_pooling(src, dst, info, cpu_info); - } - break; - case DataType::QASYMM8_SIGNED: - if(requantize) - { - create_arm_pooling_requant(src, dst, info, cpu_info); - } - else - { - create_arm_pooling(src, dst, info, cpu_info); - } - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - create_arm_pooling(src, dst, info, cpu_info); - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - create_arm_pooling(src, dst, info, cpu_info); - break; - default: - break; - } -#endif // defined(__aarch64__) - - Window win = calculate_max_window(*dst, Steps()); - INEKernel::configure(win); -} - -Status CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - -#ifndef __aarch64__ - ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels"); -#endif /* __aarch64__ */ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC), "Only NHWC is supported by assembly kernels"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.pool_type != PoolingType::AVG) && (info.pool_type != PoolingType::MAX), - "Only AVG and MAX pooling are supported by assembly kernels"); - - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - - const auto src_qinfo = src->quantization_info().uniform(); - const auto dst_qinfo = dst->quantization_info().uniform(); - - if(src_qinfo != dst_qinfo) - { - const float multiplier = src_qinfo.scale / dst_qinfo.scale; - int32_t dst_multiplier{}; - int32_t dst_shift{}; - ARM_COMPUTE_RETURN_ERROR_ON(quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift)); - } - else - { - if(src->data_type() == DataType::QASYMM8) - { - const bool has_padding = info.pad_stride_info.has_padding(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info"); - } - } - } - else - { - if(src->data_type() == DataType::QASYMM8) - { - // If dst 
is not configured, the quantization info are the same - const bool has_padding = info.pad_stride_info.has_padding(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info"); - } - } - return Status{}; -} - -void CpuPool2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get()); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_UNUSED(window); - ARM_COMPUTE_UNUSED(info); - - ARM_COMPUTE_ERROR_ON(tensors.empty()); - - const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); - ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0); - - const auto in_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); - auto out_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes(); - auto working_space = workspace->buffer() + workspace->info()->offset_first_element_in_bytes(); - - const auto src_shape = src->info()->tensor_shape(); - const auto dst_shape = dst->info()->tensor_shape(); - const auto src_padding = src->info()->padding(); - const auto dst_padding = dst->info()->padding(); - - const size_t ld_src_col = src_shape[0] + src_padding.left + src_padding.right; - const size_t ld_src_row = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom); - const size_t ld_src_batch = ld_src_row * src_shape[2]; - const size_t ld_dst_col = dst_shape[0] + dst_padding.left + dst_padding.right; - const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom); - const size_t ld_dst_batch = ld_dst_row * dst_shape[2]; - - _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch, - out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch, - working_space, info.thread_id, info.num_threads); -} - -size_t CpuPool2dAssemblyWrapperKernel::get_working_size(unsigned int num_threads) const -{ - return _kernel_asm->get_working_size(num_threads); -} - -bool CpuPool2dAssemblyWrapperKernel::is_configured() const -{ - return _kernel_asm != nullptr; -} - -template -void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info) -{ - const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? 
arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX; - - arm_conv::pooling::PoolingWindow window{}; - window.cols = static_cast(info.pool_size.x()); - window.rows = static_cast(info.pool_size.y()); - - arm_conv::pooling::PoolingStride stride{}; - std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride(); - - const arm_conv::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() }; - - constexpr unsigned int idx_width = 1; - constexpr unsigned int idx_height = 2; - constexpr unsigned int idx_channels = 0; - constexpr unsigned int idx_batches = 3; - - const unsigned int n_batches = src->dimension(idx_batches); - const unsigned int src_rows = src->dimension(idx_height); - const unsigned int src_cols = src->dimension(idx_width); - const unsigned int n_channels = src->dimension(idx_channels); - const unsigned int dst_rows = dst->dimension(idx_height); - const unsigned int dst_cols = dst->dimension(idx_width); - - arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr); - - // Configure assembly pooling kernel - auto pooling_kernel_asm = arm_conv::pooling::pooling(args); - if(pooling_kernel_asm == nullptr) - { - // Configuration not supported: Leave function unconfigured: - return; - } - - _kernel_asm = std::move(pooling_kernel_asm); -} - -template -void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info) -{ - const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX; - - arm_conv::pooling::PoolingWindow window{}; - window.cols = static_cast(info.pool_size.x()); - window.rows = static_cast(info.pool_size.y()); - - arm_conv::pooling::PoolingStride stride{}; - std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride(); - - const arm_conv::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() }; - - constexpr unsigned int idx_width = 1; - constexpr unsigned int idx_height = 2; - constexpr unsigned int idx_channels = 0; - constexpr unsigned int idx_batches = 3; - - const unsigned int n_batches = src->dimension(idx_batches); - const unsigned int src_rows = src->dimension(idx_height); - const unsigned int src_cols = src->dimension(idx_width); - const unsigned int n_channels = src->dimension(idx_channels); - const unsigned int dst_rows = dst->dimension(idx_height); - const unsigned int dst_cols = dst->dimension(idx_width); - - arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr); - - const auto src_qinfo = src->quantization_info().uniform(); - const auto dst_qinfo = dst->quantization_info().uniform(); - - const float multiplier = src_qinfo.scale / dst_qinfo.scale; - int32_t dst_multiplier{}; - int32_t dst_shift{}; - quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift); - - const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset, - dst_qinfo.offset, - dst_shift, // left shift - 0, // right shift - dst_multiplier); - - // Configure assembly pooling kernel with 
requantization - auto pooling_kernel_asm = arm_conv::pooling::pooling(args, requant_args); - if(pooling_kernel_asm == nullptr) - { - // Configuration not supported: Leave function unconfigured: - return; - } - - _kernel_asm = std::move(pooling_kernel_asm); -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h deleted file mode 100644 index 3afa4c16a4..0000000000 --- a/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H -#define ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H - -#include "arm_compute/core/Types.h" -#include "src/core/NEON/kernels/assembly/pooling.hpp" -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -#include "pool_common.hpp" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** This class is a wrapper for the assembly kernels. - * - * Some kernels were written in assembly and highly optimised for specific - * CPUs like A53 or A55. The arm compute library creates an instance of - * CpuPool2dAssemblyWrapperKernel and other auxiliary data structures to - * execute a single assembly kernel in the context of an NEFunction. - * - */ -class CpuPool2dAssemblyWrapperKernel final : public ICpuKernel -{ -public: - /** Constructor - */ - CpuPool2dAssemblyWrapperKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2dAssemblyWrapperKernel); - - const char *name() const override - { - return "CpuPool2dAssemblyWrapperKernel"; - } - - /** Initialise the kernel's src and dst. - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] dst Destination tensor info to store the result of pooling. Data types supported: same as @p src. - * @param[in] info Pooling meta-data. - * @param[in] cpu_info CPU information needed to select the most appropriate kernel. 
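A hedged sketch of how an operator might drive this wrapper kernel end to end. Only configure()/validate()/is_configured()/get_working_size() and the ACL_SRC / ACL_DST / ACL_INT_0 pack slots come from the code above; the names src_info, dst_info, pool_info, src, dst and workspace, and the scheduler wiring, are assumptions made for illustration only.

    // Illustrative operator-side wiring; not part of the deleted sources.
    ARM_COMPUTE_ERROR_THROW_ON(CpuPool2dAssemblyWrapperKernel::validate(src_info, dst_info, pool_info));

    CpuPool2dAssemblyWrapperKernel pool_kernel;
    pool_kernel.configure(src_info, dst_info, pool_info, NEScheduler::get().cpu_info());

    if(pool_kernel.is_configured())
    {
        // The assembly path needs a per-run scratch buffer sized for the worst-case thread count.
        const size_t workspace_bytes = pool_kernel.get_working_size(NEScheduler::get().num_threads());

        ITensorPack pack;
        pack.add_const_tensor(TensorType::ACL_SRC, src);   // input
        pack.add_tensor(TensorType::ACL_DST, dst);         // output
        pack.add_tensor(TensorType::ACL_INT_0, workspace); // scratch tensor of workspace_bytes bytes

        // An IScheduler then dispatches run_op() over pool_kernel.window() with this pack.
    }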
- */ - void configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); - - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuPool2dAssemblyWrapperKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - - /** Get size of the workspace needed by the assembly kernel. - * - * @param[in] num_threads Maximum number of threads that are going to be spawned. - * - * @return size of workspace - */ - size_t get_working_size(unsigned int num_threads) const; - - /** Was the asm kernel successfully configured? - * - * @return True if the asm kernel is configured and ready to run - */ - bool is_configured() const; - -private: - /** Helper function to create the assembly kernel. - * - * @param[in] src Source tensor info. - * @param[in] dst Destination tensor info. - * @param[in] info Pooling layer meta-data. - */ - template - void create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); - - /** Helper function to create the assembly kernel with requantization support - * - * @param[in] src Source tensor info. - * @param[in] dst Destination tensor info. - * @param[in] info Pooling layer meta-data. - */ - template - void create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); - - std::unique_ptr _kernel_asm{ nullptr }; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H */ diff --git a/src/core/cpu/kernels/pool2d/neon/fp16.cpp b/src/core/cpu/kernels/pool2d/neon/fp16.cpp deleted file mode 100644 index d21e153f25..0000000000 --- a/src/core/cpu/kernels/pool2d/neon/fp16.cpp +++ /dev/null @@ -1,317 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
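The requantized creation path above (create_arm_pooling_requant) folds the ratio src_scale / dst_scale into an integer multiplier and shift via quantization::calculate_quantized_multiplier. A rough, standalone illustration of that kind of fixed-point decomposition; the function and variable names are mine, not the library routine's, and it assumes a positive multiplier.

    #include <cmath>
    #include <cstdint>

    // Split a positive real multiplier into a Q0.31 fixed-point multiplier and a shift,
    // so that  value * multiplier  can be approximated as  (value * quant_mult) >> (31 + shift)
    // in integer arithmetic (shift is non-negative for multipliers below 1).
    void decompose_multiplier(float multiplier, int32_t *quant_mult, int32_t *shift)
    {
        int exponent = 0;
        const double mantissa = std::frexp(multiplier, &exponent); // multiplier = mantissa * 2^exponent, mantissa in [0.5, 1)
        int64_t q = static_cast<int64_t>(std::llround(mantissa * (1ll << 31)));
        if(q == (1ll << 31)) // rounding pushed the mantissa up to 1.0
        {
            q /= 2;
            ++exponent;
        }
        *quant_mult = static_cast<int32_t>(q);
        *shift      = -exponent; // positive value means shift right
    }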
- */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "src/core/cpu/kernels/pool2d/neon/list.h" -#include "src/core/helpers/WindowHelpers.h" - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) - -namespace arm_compute -{ -namespace cpu -{ -namespace -{ -void pooling2_f16_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 8; - - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, window_src); - Iterator out(dst0, window_out); - Iterator indices(dst1, window_out); - - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - - const int pad_right = src->info()->padding().right; - const int pad_left = src->info()->padding().left; - const int pad_horizontal = pad_right + pad_left; - const int in_stride_y = static_cast(src->info()->strides_in_bytes().y()); - const int in_stride_z = static_cast(src->info()->strides_in_bytes().z()); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = id.y() * pool_stride_x; - const int idx_height = id.z() * pool_stride_y; - const int pool_limit_y = pool_pad_top - idx_height; - const int pool_limit_x = pool_pad_left - idx_width; - - const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); - const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); - const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z()); - const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z()); - const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z()); - const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z()); - - int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) - { - const auto in_x0_ptr = reinterpret_cast(in.ptr() + in_x0_offset) + x_off; - const auto in_x1_ptr = reinterpret_cast(in.ptr() + in_x1_offset) + x_off; - const auto in_x2_ptr = reinterpret_cast(in.ptr() + in_x2_offset) + x_off; - const auto in_x3_ptr = reinterpret_cast(in.ptr() + in_x3_offset) + x_off; - const auto v_x0 = vld1q_f16(in_x0_ptr); - const auto v_x1 = vld1q_f16(in_x1_ptr); - const auto v_x2 = vld1q_f16(in_x2_ptr); - const auto v_x3 = vld1q_f16(in_x3_ptr); - float16x8_t vres = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1)); - // Store result - vst1q_f16(reinterpret_cast(out.ptr()) + x_off, vres); - - const uint32_t offset_base = 
offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC); - const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off; - const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal; - const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_horizontal * src->info()->tensor_shape()[1]; - const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal; - const uint32x4_t voffset_x0_0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 }; - const uint32x4_t voffset_x0_1 = { offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 }; - const uint16x8_t voffset_x0 = vcombine_u16(vmovn_u32(voffset_x0_0), vmovn_u32(voffset_x0_1)); - const uint32x4_t voffset_x1_0 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 }; - const uint32x4_t voffset_x1_1 = { offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 }; - const uint16x8_t voffset_x1 = vcombine_u16(vmovn_u32(voffset_x1_0), vmovn_u32(voffset_x1_1)); - const uint32x4_t voffset_x2_0 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 }; - const uint32x4_t voffset_x2_1 = { offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 }; - const uint16x8_t voffset_x2 = vcombine_u16(vmovn_u32(voffset_x2_0), vmovn_u32(voffset_x2_1)); - const uint32x4_t voffset_x3_0 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 }; - const uint32x4_t voffset_x3_1 = { offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 }; - const uint16x8_t voffset_x3 = vcombine_u16(vmovn_u32(voffset_x3_0), vmovn_u32(voffset_x3_1)); - const uint16x8_t tmp_indices0 = vbslq_u16(vcgeq_f16(v_x0, v_x1), voffset_x0, voffset_x1); - const uint16x8_t tmp_indices1 = vbslq_u16(vcgeq_f16(v_x2, v_x3), voffset_x2, voffset_x3); - const uint16x8_t tmp_indices2 = vbslq_u16(vcgeq_f16(vmaxq_f16(v_x0, v_x1), vmaxq_f16(v_x2, v_x3)), tmp_indices0, tmp_indices1); - const uint32x4_t tmp_indeces3_0 = vmovl_u16(vget_low_u16(tmp_indices2)); - const uint32x4_t tmp_indeces3_1 = vmovl_u16(vget_high_u16(tmp_indices2)); - // Store indicies - vst1q_u32(reinterpret_cast(indices.ptr()) + x_off, tmp_indeces3_0); - vst1q_u32(reinterpret_cast(indices.ptr() + 16) + x_off, tmp_indeces3_1); - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - const auto x0 = *(reinterpret_cast(in.ptr() + in_x0_offset) + x_off); - const auto x1 = *(reinterpret_cast(in.ptr() + in_x1_offset) + x_off); - const auto x2 = *(reinterpret_cast(in.ptr() + in_x2_offset) + x_off); - const auto x3 = *(reinterpret_cast(in.ptr() + in_x3_offset) + x_off); - float16_t res = std::max(std::max(x2, x3), std::max(x0, x1)); - - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - - const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC); - const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off; - const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal; - const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_horizontal * src->info()->tensor_shape()[1]; - const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal; - const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1; - const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3; - const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? 
tmp_idx0 : tmp_idx1; - - // Store indices - *(reinterpret_cast(indices.ptr()) + x_off) = tmp_idx2; - } - }, - in, out, indices); -} -} - -void poolingMxN_fp16_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - if(pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1) - { - pooling2_f16_maxpool_indices(src, dst0, dst1, pool_info, window_src, window); - } - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 8; - - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, window_src); - Iterator out(dst0, window_out); - - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - - float16x8_t vres; - - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = id.y() * pool_stride_x; - const int idx_height = id.z() * pool_stride_y; - const int pool_limit_y = pool_pad_top - idx_height; - const int pool_limit_x = pool_pad_left - idx_width; - - const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); - const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); - const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); - const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); - - int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) - { - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float16x8_t scale_v = vdupq_n_f16(scale); - - // Perform pooling - vres = vdupq_n_f16(0.0f); - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float16x8_t data = vld1q_f16(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - - // Get power of 2 in case of l2 pooling and accumulate - if(pool_info.pool_type == PoolingType::L2) - { - vres = vaddq_f16(vres, vmulq_f16(data, data)); - } - else - { - vres = vaddq_f16(vres, data); - } - } - } - // Divide by scale - vres = vmulq_f16(vres, scale_v); - } - else - { - vres = vdupq_n_f16(std::numeric_limits::lowest()); - - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float16x8_t data = 
vld1q_f16(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - vres = vmaxq_f16(vres, data); - } - } - } - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres); - vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal)); - } - - // Store result - vst1q_f16(reinterpret_cast(out.ptr()) + x_off, vres); - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - float16_t res = 0.0f; - - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - const float16_t scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - - // Get power of 2 in case of l2 pooling and accumulate - if(pool_info.pool_type == PoolingType::L2) - { - res += data * data; - } - else - { - res += data; - } - } - } - - // Divide by scale - res *= scale; - } - else - { - res = std::numeric_limits::lowest(); - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float16_t data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - res = std::max(res, data); - } - } - } - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = std::sqrt(res); - } - - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - }, - in, out); -} -} // namespace cpu -} // namespace arm_compute - -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ \ No newline at end of file diff --git a/src/core/cpu/kernels/pool2d/neon/fp32.cpp b/src/core/cpu/kernels/pool2d/neon/fp32.cpp deleted file mode 100644 index c82cad0ffd..0000000000 --- a/src/core/cpu/kernels/pool2d/neon/fp32.cpp +++ /dev/null @@ -1,314 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
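The L2-pooling epilogue just above avoids a direct square root: vrsqrteq gives a rough 1/sqrt estimate, one vrsqrtsq step refines it (vrsqrts(a, b) computes (3 - a*b) / 2), and multiplying the refined reciprocal by the input yields the square root. The same trick in fp32, as a self-contained sketch:

    #include <arm_neon.h>

    // sqrt(x) via a reciprocal-sqrt estimate plus one Newton-Raphson refinement
    // (mirrors the f16 epilogue above; x == 0 yields NaN here, since 0 * inf).
    static inline float32x4_t sqrt_via_rsqrt_f32(float32x4_t x)
    {
        float32x4_t r = vrsqrteq_f32(x);                    // r ~ 1/sqrt(x)
        r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(x, r), r)); // r *= (3 - x*r*r) / 2
        return vmulq_f32(x, r);                             // x * 1/sqrt(x) == sqrt(x)
    }

For comparison, the fp32 NHWC kernel further down extracts each lane and calls sqrt directly instead.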
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "src/core/cpu/kernels/pool2d/neon/list.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace -{ -void pooling2_f32_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 4; - - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, window_src); - Iterator out(dst0, window_out); - Iterator indices(dst1, window_out); - - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - - float32x4_t vres; - float res; - - const int pad_right = src->info()->padding().right; - const int pad_left = src->info()->padding().left; - const int pad_horizontal = pad_right + pad_left; - const int in_stride_y = static_cast(src->info()->strides_in_bytes().y()); - const int in_stride_z = static_cast(src->info()->strides_in_bytes().z()); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = id.y() * pool_stride_x; - const int idx_height = id.z() * pool_stride_y; - const int pool_limit_y = pool_pad_top - idx_height; - const int pool_limit_x = pool_pad_left - idx_width; - - const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); - const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); - - const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().z()); - const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z()); - const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z()); - const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z()); - - int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) - { - const auto in_x0_ptr = reinterpret_cast(in.ptr() + in_x0_offset); - const auto in_x1_ptr = reinterpret_cast(in.ptr() + in_x1_offset); - const auto in_x2_ptr = reinterpret_cast(in.ptr() + in_x2_offset); - const auto in_x3_ptr = reinterpret_cast(in.ptr() + in_x3_offset); - const auto v_x0 = vld1q_f32(in_x0_ptr + x_off); - const auto v_x1 = vld1q_f32(in_x1_ptr + x_off); - const auto v_x2 = vld1q_f32(in_x2_ptr + x_off); - const auto v_x3 = 
vld1q_f32(in_x3_ptr + x_off); - vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1)); - // Store result - vst1q_f32(reinterpret_cast(out.ptr()) + x_off, vres); - - const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC); - const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float) + x_off; - const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_horizontal; - const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1]; - const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_horizontal; - const uint32x4_t voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 }; - const uint32x4_t voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 }; - const uint32x4_t voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 }; - const uint32x4_t voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 }; - const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1); - const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3); - const uint32x4_t tmp_indices2 = vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1); - - // Store indices - vst1q_u32(reinterpret_cast(indices.ptr()) + x_off, tmp_indices2); - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - const auto x0 = *(reinterpret_cast(in.ptr() + in_x0_offset) + x_off); - const auto x1 = *(reinterpret_cast(in.ptr() + in_x1_offset) + x_off); - const auto x2 = *(reinterpret_cast(in.ptr() + in_x2_offset) + x_off); - const auto x3 = *(reinterpret_cast(in.ptr() + in_x3_offset) + x_off); - res = std::max(std::max(x2, x3), std::max(x0, x1)); - - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - - const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC); - const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float) + x_off; - const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_horizontal; - const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1]; - const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_horizontal; - const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1; - const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3; - const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1; - - // Store indices - *(reinterpret_cast(indices.ptr()) + x_off) = tmp_idx2; - } - }, - in, out, indices); -} -} - -void poolingMxN_fp32_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - if(pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1) - { - pooling2_f32_maxpool_indices(src, dst0, dst1, pool_info, window_src, window); - } - else - { - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 4; - - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, window_src); - Iterator out(dst0, window_out); - - const int pool_size_x = pool_info.is_global_pooling ? 
src->info()->tensor_shape().y() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - - float32x4_t vres; - - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = id.y() * pool_stride_x; - const int idx_height = id.z() * pool_stride_y; - const int pool_limit_y = pool_pad_top - idx_height; - const int pool_limit_x = pool_pad_left - idx_width; - - const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); - const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); - const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); - const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); - - int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) - { - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float32x4_t scale_v = vdupq_n_f32(scale); - - // Perform pooling - vres = vdupq_n_f32(0.0f); - - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float32x4_t data = vld1q_f32(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - - // Get power of 2 in case of l2 pooling and accumulate - if(pool_info.pool_type == PoolingType::L2) - { - vres = vmlaq_f32(vres, data, data); - } - else - { - vres = vaddq_f32(vres, data); - } - } - } - // Divide by scale - vres = vmulq_f32(vres, scale_v); - } - else - { - vres = vdupq_n_f32(std::numeric_limits::lowest()); - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float32x4_t data = vld1q_f32(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - vres = vmaxq_f32(vres, data); - } - } - } - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - float32x4_t l2_res = { static_cast(sqrt(vgetq_lane_f32(vres, 0))), - static_cast(sqrt(vgetq_lane_f32(vres, 1))), - static_cast(sqrt(vgetq_lane_f32(vres, 2))), - static_cast(sqrt(vgetq_lane_f32(vres, 3))) - }; - vres = l2_res; - } - - // Store result - vst1q_f32(reinterpret_cast(out.ptr()) + x_off, vres); - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - float res = 0.0f; - - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - const float scale = 
calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - - // Get power of 2 in case of l2 pooling and accumulate - if(pool_info.pool_type == PoolingType::L2) - { - res += data * data; - } - else - { - res += data; - } - } - } - - // Divide by scale - res *= scale; - } - else - { - res = std::numeric_limits::lowest(); - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - res = std::max(res, data); - } - } - } - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = std::sqrt(res); - } - - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - }, - in, out); - } -} -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/pool2d/neon/list.h b/src/core/cpu/kernels/pool2d/neon/list.h deleted file mode 100644 index f1e23d43cf..0000000000 --- a/src/core/cpu/kernels/pool2d/neon/list.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
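Both NHWC kernels above divide the accumulated sum by the value returned from calculate_avg_scale, i.e. one over the number of window positions actually used, clamped to the tensor bounds and optionally excluding padded positions. A hedged sketch of that computation; the signature and names below are mine, not the library helper's.

    #include <algorithm>

    // Scale applied after summing one average-pooling window located at output (out_x, out_y).
    float avg_pool_scale(int out_x, int out_y, int pool_w, int pool_h,
                         int upper_bound_w, int upper_bound_h,
                         int pad_left, int pad_top,
                         int stride_x, int stride_y, bool exclude_padding)
    {
        int start_x = out_x * stride_x - pad_left;
        int start_y = out_y * stride_y - pad_top;
        const int end_x = std::min(start_x + pool_w, upper_bound_w);
        const int end_y = std::min(start_y + pool_h, upper_bound_h);
        if(exclude_padding)
        {
            // Padded positions do not count towards the average.
            start_x = std::max(0, start_x);
            start_y = std::max(0, start_y);
        }
        return 1.f / static_cast<float>((end_x - start_x) * (end_y - start_y));
    }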
- */ -#ifndef SRC_CORE_NEON_KERNELS_POOLING_LIST_H -#define SRC_CORE_NEON_KERNELS_POOLING_LIST_H - -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/cpu/kernels/pool2d/neon/quantized.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -#define DECLARE_POOLING_KERNEL(func_name) \ - void func_name(const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, const Window &window) - -DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_neon_nhwc); -DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_signed_neon_nhwc); -DECLARE_POOLING_KERNEL(poolingMxN_fp16_neon_nhwc); -DECLARE_POOLING_KERNEL(poolingMxN_fp32_neon_nhwc); - -#if defined(ENABLE_NCHW_KERNELS) - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -DECLARE_POOLING_KERNEL(pooling2_fp16_neon_nchw); -DECLARE_POOLING_KERNEL(pooling3_fp16_neon_nchw); -DECLARE_POOLING_KERNEL(poolingMxN_fp16_neon_nchw); -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ - -DECLARE_POOLING_KERNEL(pooling2_fp32_neon_nchw); -DECLARE_POOLING_KERNEL(pooling3_fp32_neon_nchw); -DECLARE_POOLING_KERNEL(pooling7_fp32_neon_nchw); -DECLARE_POOLING_KERNEL(poolingMxN_fp32_neon_nchw); -#endif /* defined(ENABLE_NCHW_KERNELS) */ - -#undef DECLARE_POOLING_KERNEL - -template -inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id, const ITensorInfo &info, int pool_stride_x, int pool_stride_y, DataLayout data_layout) -{ - const int pad_left = info.padding().left; - const int pad_right = info.padding().right; - const int pad_top = info.padding().top; - const int pad_bottom = info.padding().bottom; - const int in_stride_y = static_cast(info.strides_in_bytes().y()); - const int in_stride_w = static_cast(info.strides_in_bytes()[3]); - const int pad_horiz = pad_left + pad_right; - const int pad_vert = pad_top + pad_bottom; - - if(data_layout == DataLayout::NCHW) - { - const uint32_t offset_base = padded_offset - - sizeof(T) * pad_horiz * id.y() * pool_stride_y /* subtract padding elems per row */ - - pad_top * sizeof(T) /* top padding */ - - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() - pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */ - - in_stride_w * id[3]; - - return offset_base; - } - else - { - const uint32_t offset_base = padded_offset - - sizeof(T) * pad_horiz * id.y() * pool_stride_x // subtract padding elems per row - - pad_top * sizeof(T) // top padding - - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() * pool_stride_y // for each Z plane there are width*pad_right padding elems - - in_stride_w * id[3]; - - return offset_base; - } -} -} // namespace cpu -} // namespace arm_compute - -#endif // SRC_CORE_NEON_KERNELS_POOLING_LIST_H \ No newline at end of file diff --git a/src/core/cpu/kernels/pool2d/neon/nchw/all.cpp b/src/core/cpu/kernels/pool2d/neon/nchw/all.cpp deleted file mode 100644 index bece438989..0000000000 --- a/src/core/cpu/kernels/pool2d/neon/nchw/all.cpp +++ /dev/null @@ -1,700 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
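The offset_no_padding() helper above strips the padding contributions out of the iterator's byte offset so that the stored max-pool index refers to the unpadded source tensor, which is what a later max-unpooling step expects. For an unpadded buffer that index is plain coordinate arithmetic; a small illustration of the quantity being recovered (shapes and layout order assumed, not taken from the library):

    #include <cstdint>

    // Linear element index into an unpadded NCHW tensor of shape [N, C, H, W].
    static inline uint32_t linear_index_nchw(uint32_t n, uint32_t c, uint32_t y, uint32_t x,
                                             uint32_t C, uint32_t H, uint32_t W)
    {
        return ((n * C + c) * H + y) * W + x;
    }

    // Linear element index into an unpadded NHWC tensor of shape [N, H, W, C].
    static inline uint32_t linear_index_nhwc(uint32_t n, uint32_t y, uint32_t x, uint32_t c,
                                             uint32_t H, uint32_t W, uint32_t C)
    {
        return ((n * H + y) * W + x) * C + c;
    }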
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "src/core/cpu/kernels/pool2d/neon/list.h" -#include "src/core/helpers/WindowHelpers.h" - -#ifdef ENABLE_NCHW_KERNELS -namespace arm_compute -{ -namespace cpu -{ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void pooling3_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - ARM_COMPUTE_UNUSED(dst1); - ARM_COMPUTE_UNUSED(pool_info.pool_type); - ARM_COMPUTE_UNUSED(pool_info.exclude_padding); - - Iterator in(src, window_src); - Iterator out(dst0, window); - - constexpr const int pool_size = 3; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); - - const unsigned char *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const unsigned char *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); - const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 2)); - - execute_window_loop(window, [&](const Coordinates & id) - { - float16x4_t top_data = vld1_f16(reinterpret_cast(src_top_ptr + in.offset())); - float16x4_t middle_data = vld1_f16(reinterpret_cast(src_middle_ptr + in.offset())); - float16x4_t bottom_data = vld1_f16(reinterpret_cast(src_bottom_ptr + in.offset())); - float16x4_t res = {}; - - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - top_data = vmul_f16(top_data, top_data); - middle_data = vmul_f16(middle_data, middle_data); - bottom_data = vmul_f16(bottom_data, bottom_data); - } - - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float16x4_t scale_v = vdup_n_f16(scale); - // Perform pooling - const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data); - res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data); - res = vmul_f16(vpadd_f16(res, res), scale_v); - } - else - { - const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data); - res = vpmax_f16(vset_lane_f16(-std::numeric_limits::max(), max_data, 3), max_data); - res = vpmax_f16(res, res); - } - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = vinv_f16(vinvsqrt_f16(res)); - } - - *(reinterpret_cast(out.ptr())) = vget_lane_f16(res, 0); - }, - in, out); -} - -template -inline typename std::enable_if::value, float32x2_t>::type -f16_to_f32(float16x4_t in) -{ - float32x2_t out = { static_cast(vget_lane_f16(in, 0)), static_cast(vget_lane_f16(in, 1)) }; - return out; -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -template -inline typename std::enable_if::value, float32x2_t>::type -f16_to_f32(float32x2_t in) -{ - return in; -} - -template -void pooling2_nchw_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - Iterator in(src, window_src); - Iterator out(dst0, window); - Iterator indices(dst1, window); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); - const int pad_left = src->info()->padding().left; - const int pad_right = src->info()->padding().right; - const int in_stride_y = static_cast(src->info()->strides_in_bytes().y()); - - execute_window_loop(window, [&](const Coordinates & id) - { - auto top_data = wrapper::vload(reinterpret_cast(src_top_ptr + in.offset())); - auto bottom_data = 
wrapper::vload(reinterpret_cast(src_bottom_ptr + in.offset())); - float32x2_t top_data_f32 = f16_to_f32(top_data); - float32x2_t bottom_data_f32 = f16_to_f32(bottom_data); - - // Calculate max data, compare top first, then bottom, to make sue the first max is recorded. - const float32x2_t max_data_top = vpmax_f32(top_data_f32, top_data_f32); - const float32x2_t max_data_bottom = vpmax_f32(bottom_data_f32, bottom_data_f32); - const float32x2_t max_data = vmax_f32(max_data_top, max_data_bottom); - *(reinterpret_cast(out.ptr())) = static_cast(vget_lane_f32(max_data, 0)); - - // Calculate max data indice, which will be used in max unpool. - const uint32_t offset_base = offset_no_padding(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NCHW); - const uint32_t offset_top = (uint32_t)(offset_base / sizeof(T)); - const uint32_t offset_bottom = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left; - const uint32x2_t voffset_top = { offset_top, offset_top + 1u }; - const uint32x2_t voffset_bottom = { offset_bottom, offset_bottom + 1u }; - const uint32x2_t tmp_indices_top = vbsl_u32(vcge_f32(top_data_f32, vrev64_f32(top_data_f32)), voffset_top, vrev64_u32(voffset_top)); - const uint32x2_t tmp_indices_bottom = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)), voffset_bottom, vrev64_u32(voffset_bottom)); - *(reinterpret_cast(indices.ptr())) = vget_lane_u32(vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0); - }, - in, out, indices); -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void pooling2_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - if(pool_info.pool_type == PoolingType::MAX && dst1) - { - pooling2_nchw_maxpool_indices(src, dst0, dst1, pool_info, window_src, window); - } - else - { - Iterator in(src, window_src); - Iterator out(dst0, window); - constexpr int pool_size = 2; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x, pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); - - const unsigned char *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); - - execute_window_loop(window, [&](const Coordinates & id) - { - float16x4_t top_data = vld1_f16(reinterpret_cast(src_top_ptr + in.offset())); - float16x4_t bottom_data = vld1_f16(reinterpret_cast(src_bottom_ptr + in.offset())); - float16x4_t res = {}; - - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - top_data = vmul_f16(top_data, top_data); - bottom_data = vmul_f16(bottom_data, bottom_data); - } - - if(pool_info.pool_type != PoolingType::MAX) - { - const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float16x4_t scale_v = vdup_n_f16(scale); - - const float16x4_t sum_data = vadd_f16(top_data, bottom_data); - res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v); - } - else - { - const float16x4_t max_data = vmax_f16(top_data, bottom_data); - res = vpmax_f16(max_data, max_data); - } - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = vinv_f16(vinvsqrt_f16(res)); - } - - // Store result - *(reinterpret_cast(out.ptr())) = vget_lane_f16(res, 0); - }, - in, out); - } -} - -void poolingMxN_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - ARM_COMPUTE_UNUSED(dst1); - Iterator in(src, window_src); - Iterator out(dst0, window); - - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); - - execute_window_loop(window, [&](const Coordinates & id) - { - float16_t res = 0.0f; - float16x8_t vres = vdupq_n_f16(0.0f); - - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - - // Perform pooling - - for(int y = 0; y < pool_size_y; ++y) - { - int x = 0; - for(; x <= (pool_size_x - 8); x += 8) - { - const float16x8_t data = vld1q_f16(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().y()))); - - // Get power of 2 in case of l2 pooling and accumulate - if(pool_info.pool_type == PoolingType::L2) - { - vres = vaddq_f16(vres, vmulq_f16(data, data)); - } - else - { - vres = vaddq_f16(vres, data); - } - } - - // Leftover for loop - for(; x < pool_size_x; ++x) - { - float16_t data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) - + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().y()))); - - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - data *= data; - } - - res += data; - } - } - - // Reduction - float16x4_t tmp = vpadd_f16(vget_high_f16(vres), vget_low_f16(vres)); - res += vget_lane_f16(tmp, 0); - res += vget_lane_f16(tmp, 1); - res += vget_lane_f16(tmp, 2); - res += vget_lane_f16(tmp, 3); - - // Divide by scale - res *= scale; - } - else - { - float16x8_t vres = vdupq_n_f16(std::numeric_limits::lowest()); - res = std::numeric_limits::lowest(); - - for(int y = 0; y < pool_size_y; ++y) - { - int x = 0; - for(; x <= (pool_size_x - 8); x += 8) - { - const float16x8_t data = vld1q_f16(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().y()))); - vres = vmaxq_f16(vres, data); - } - - // Leftover for loop - for(; x < pool_size_x; ++x) - { - const float16_t data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) - + (y - pool_pad_top) * static_cast(src->info()->strides_in_bytes().y()))); - res = std::max(res, data); - } - } - - float16x4_t tmp = vpmax_f16(vget_high_f16(vres), vget_low_f16(vres)); - res = std::max(res, vget_lane_f16(tmp, 0)); - res = std::max(res, vget_lane_f16(tmp, 1)); - res = std::max(res, vget_lane_f16(tmp, 2)); - res = std::max(res, vget_lane_f16(tmp, 3)); - } - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = std::sqrt(res); - } - - // Store result - *(reinterpret_cast(out.ptr())) = res; - }, - in, out); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -void poolingMxN_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - ARM_COMPUTE_UNUSED(dst1); - Iterator in(src, window_src); - Iterator out(dst0, window); - - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? 
src->info()->tensor_shape().y() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - - execute_window_loop(window, [&](const Coordinates & id) - { - float res = 0.0f; - - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - - // Perform pooling - float32x4_t vres = vdupq_n_f32(0.0f); - - for(int y = 0; y < pool_size_y; ++y) - { - int x = 0; - for(; x <= (pool_size_x - 4); x += 4) - { - const float32x4_t data = vld1q_f32(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().y()))); - - // Get power of 2 in case of l2 pooling and accumulate - if(pool_info.pool_type == PoolingType::L2) - { - vres = vmlaq_f32(vres, data, data); - } - else - { - vres = vaddq_f32(vres, data); - } - } - - // Leftover for loop - for(; x < pool_size_x; ++x) - { - float data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().y()))); - - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - data *= data; - } - - res += data; - } - } - -#if defined(__aarch64__) - // Reduction operation available on 64 bit architectures only - res += vaddvq_f32(vres); -#else // __aarch64__ - // Reduction - float32x2_t tmp = vpadd_f32(vget_high_f32(vres), vget_low_f32(vres)); - tmp = vpadd_f32(tmp, tmp); - - res += vget_lane_f32(tmp, 0); -#endif // __aarch64__ - // Divide by scale - res *= scale; - } - else - { - float32x4_t vres = vdupq_n_f32(std::numeric_limits::lowest()); - res = std::numeric_limits::lowest(); - - for(int y = 0; y < pool_size_y; ++y) - { - int x = 0; - for(; x <= (pool_size_x - 4); x += 4) - { - const float32x4_t data = vld1q_f32(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().y()))); - vres = vmaxq_f32(vres, data); - } - - // Leftover for loop - for(; x < pool_size_x; ++x) - { - const float data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().y()))); - res = std::max(res, data); - } - } -#if defined(__aarch64__) - // Reduction operation available on 64 bit architectures only - res = std::max(vmaxvq_f32(vres), res); -#else // __aarch64__ - float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres)); - tmp = vpmax_f32(tmp, tmp); - - res = std::max(res, vget_lane_f32(tmp, 0)); -#endif // __aarch64__ - } - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - 
{ - res = std::sqrt(res); - } - - // Store result - *(reinterpret_cast(out.ptr())) = res; - }, - in, out); -} - -void pooling2_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - if(pool_info.pool_type == PoolingType::MAX && dst1) - { - pooling2_nchw_maxpool_indices(src, dst0, dst1, pool_info, window_src, window); - } - else - { - Iterator in(src, window_src); - Iterator out(dst0, window); - constexpr int pool_size = 2; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - - const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto in_top_ptr = reinterpret_cast(src_top_ptr + in.offset()); - const auto in_bottom_ptr = reinterpret_cast(src_bottom_ptr + in.offset()); - float32x2_t top_data = vld1_f32(in_top_ptr); - float32x2_t bottom_data = vld1_f32(in_bottom_ptr); - float32x2_t res = {}; - float final_res = 0; - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - top_data = vmul_f32(top_data, top_data); - bottom_data = vmul_f32(bottom_data, bottom_data); - } - - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float32x2_t scale_v = vdup_n_f32(scale); - - // Perform pooling - const float32x2_t sum_data = vadd_f32(top_data, bottom_data); - res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v); - } - else - { - const float32x2_t max_data = vmax_f32(top_data, bottom_data); - res = vpmax_f32(max_data, max_data); - } - final_res = vget_lane_f32(res, 0); - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - final_res = sqrt(final_res); - } - - // Store result - *(reinterpret_cast(out.ptr())) = final_res; - }, - in, out); - } -} - -void pooling3_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - ARM_COMPUTE_UNUSED(dst1); - Iterator in(src, window_src); - Iterator out(dst0, window); - - constexpr const int pool_size = 3; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(0) + 
(pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - - const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const uint8_t *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); - const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 2)); - - execute_window_loop(window, [&](const Coordinates & id) - { - float32x4_t top_data = vld1q_f32(reinterpret_cast(src_top_ptr + in.offset())); - float32x4_t middle_data = vld1q_f32(reinterpret_cast(src_middle_ptr + in.offset())); - float32x4_t bottom_data = vld1q_f32(reinterpret_cast(src_bottom_ptr + in.offset())); - float32x2_t res = {}; - float final_res = 0; - - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - top_data = vmulq_f32(top_data, top_data); - middle_data = vmulq_f32(middle_data, middle_data); - bottom_data = vmulq_f32(bottom_data, bottom_data); - } - - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float32x2_t scale_v = vdup_n_f32(scale); - - // Perform pooling - const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data); - res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data)); - res = vmul_f32(vpadd_f32(res, res), scale_v); - } - else - { - const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data); - res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits::max(), max_data, 3)), vget_low_f32(max_data)); - res = vpmax_f32(res, res); - } - final_res = vget_lane_f32(res, 0); - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - final_res = sqrt(final_res); - } - - // Store result - *(reinterpret_cast(out.ptr())) = final_res; - }, - in, out); -} - -void pooling7_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - ARM_COMPUTE_UNUSED(dst1); - Iterator in(src, window_src); - Iterator out(dst0, window); - - constexpr const int pool_size = 7; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); - - std::array src_ptrs{ {} }; - for(int i = 0; i < pool_size; ++i) - { - src_ptrs[i] = src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + i)); - } - - execute_window_loop(window, [&](const Coordinates & id) - { - float32x2_t res = {}; - float final_res = 0.f; - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float32x2_t scale_v = vdup_n_f32(scale); - - // Perform pooling - float32x4x2_t data = vld2q_f32(reinterpret_cast(src_ptrs[0] + in.offset())); - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - data.val[0] = vmulq_f32(data.val[0], data.val[0]); - data.val[1] = vmulq_f32(data.val[1], data.val[1]); - } - float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3)); - for(int i = 1; i < pool_size; ++i) - { - data = vld2q_f32(reinterpret_cast(src_ptrs[i] + in.offset())); - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - data.val[0] = vmulq_f32(data.val[0], data.val[0]); - data.val[1] = vmulq_f32(data.val[1], data.val[1]); - } - sum_data = vaddq_f32(sum_data, data.val[0]); - sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3)); - } - res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data)); - res = vmul_f32(vpadd_f32(res, res), scale_v); - } - else - { - float32x4x2_t max_data = vld2q_f32(reinterpret_cast(src_ptrs[0] + in.offset())); - for(int i = 1; i < pool_size; ++i) - { - const float32x4x2_t data = vld2q_f32(reinterpret_cast(src_ptrs[i] + in.offset())); - max_data = vmax2q_f32(max_data, data); - } - res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1])); - res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0]))); - res = vpmax_f32(res, res); - } - final_res = vget_lane_f32(res, 0); - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - final_res = sqrt(final_res); - } - - // Store result - *(reinterpret_cast(out.ptr())) = final_res; - }, - in, out); -} -} // namespace cpu -} // namespace arm_compute - -#endif // ENABLE_NCHW_KERNELS \ No newline at end of file diff --git a/src/core/cpu/kernels/pool2d/neon/qasymm8.cpp b/src/core/cpu/kernels/pool2d/neon/qasymm8.cpp deleted file mode 100644 index 4020e9e3fc..0000000000 --- a/src/core/cpu/kernels/pool2d/neon/qasymm8.cpp +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
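For reference while reading the NCHW float kernels deleted above: pooling2/pooling3/pooling7 and poolingMxN all follow the same recipe of squaring the inputs first when the pool type is L2, accumulating (or taking the running maximum), multiplying by the averaging divisor derived from calculate_avg_scale, and taking a square root at the end for L2. A minimal scalar sketch of that recipe follows; the names are illustrative only, and the padding bookkeeping is simplified (it clips the window to the tensor on every side, whereas the kernels above also count right/bottom padding when exclude_padding is false).

#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

// Scalar reference for one NCHW output element; illustrates the pattern used by
// the NEON kernels above (square -> accumulate/max -> scale -> sqrt for L2).
// Hypothetical helper, not part of the patch.
enum class PoolType { MAX, AVG, L2 };

float pool_one_output_nchw(const std::vector<float> &plane, int in_w, int in_h,
                           int out_x, int out_y, int pool_w, int pool_h,
                           int stride_x, int stride_y, int pad_x, int pad_y,
                           PoolType type, bool exclude_padding)
{
    const int start_x = out_x * stride_x - pad_x;
    const int start_y = out_y * stride_y - pad_y;
    const int end_x   = std::min(start_x + pool_w, in_w);
    const int end_y   = std::min(start_y + pool_h, in_h);
    const int x0      = std::max(start_x, 0);
    const int y0      = std::max(start_y, 0);

    // Averaging divisor: window clipped to the tensor, optionally excluding padding,
    // mirroring what calculate_avg_scale() produces.
    const int   count = exclude_padding ? (end_x - x0) * (end_y - y0)
                                        : (end_x - start_x) * (end_y - start_y);
    const float scale = 1.f / static_cast<float>(count);

    float res = (type == PoolType::MAX) ? std::numeric_limits<float>::lowest() : 0.f;
    for(int y = y0; y < end_y; ++y)
    {
        for(int x = x0; x < end_x; ++x)
        {
            float v = plane[y * in_w + x];
            if(type == PoolType::L2)  v *= v;          // "get power of 2" step
            if(type == PoolType::MAX) res = std::max(res, v);
            else                      res += v;
        }
    }
    if(type != PoolType::MAX) res *= scale;            // divide by the window size
    if(type == PoolType::L2)  res = std::sqrt(res);    // L2 pooling is the sqrt of the mean of squares
    return res;
}

The NEON versions vectorise the inner loops and add scalar leftover loops, but each output value they produce follows this same per-element computation.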
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/cpu/kernels/pool2d/neon/list.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void poolingMxN_qasymm8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
-    poolingMxN_q8_neon_nhwc<uint8_t>(src, dst0, dst1, pool_info, window_src, window);
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/cpu/kernels/pool2d/neon/qasymm8_signed.cpp b/src/core/cpu/kernels/pool2d/neon/qasymm8_signed.cpp
deleted file mode 100644
index a899427484..0000000000
--- a/src/core/cpu/kernels/pool2d/neon/qasymm8_signed.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/cpu/kernels/pool2d/neon/list.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void poolingMxN_qasymm8_signed_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
-    poolingMxN_q8_neon_nhwc<int8_t>(src, dst0, dst1, pool_info, window_src, window);
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/cpu/kernels/pool2d/neon/quantized.h b/src/core/cpu/kernels/pool2d/neon/quantized.h
deleted file mode 100644
index a16960a205..0000000000
--- a/src/core/cpu/kernels/pool2d/neon/quantized.h
+++ /dev/null
@@ -1,863 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
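The quantized pooling kernels that follow requantize whenever the source and destination UniformQuantizationInfo differ, by folding the dequantize/quantize pair into a single scale and offset (requant_scale = dst.scale / src.scale, requant_offset = dst.offset - src.offset / requant_scale). A scalar sketch of why the folded form matches the direct dequantize-then-quantize form, up to rounding, is shown below; the struct and helper names are illustrative, not the library's.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Illustrative only: simplified uniform quantization parameters.
struct QInfo { float scale; int32_t offset; };

// Direct form: dequantize with the source parameters, re-quantize with the destination's.
uint8_t requantize_direct(uint8_t q_in, QInfo src, QInfo dst)
{
    const float   real = src.scale * (static_cast<float>(q_in) - static_cast<float>(src.offset));
    const int32_t q    = static_cast<int32_t>(std::lround(real / dst.scale)) + dst.offset;
    return static_cast<uint8_t>(std::clamp<int32_t>(q, 0, 255));
}

// Folded form, as in the kernels above: precompute one scale/offset pair so the raw
// integer value can go through a single quantize step.
uint8_t requantize_folded(uint8_t q_in, QInfo src, QInfo dst)
{
    const float   requant_scale  = dst.scale / src.scale;
    const int32_t requant_offset = dst.offset - static_cast<int32_t>(static_cast<float>(src.offset) / requant_scale);
    const int32_t q              = static_cast<int32_t>(std::lround(static_cast<float>(q_in) / requant_scale)) + requant_offset;
    return static_cast<uint8_t>(std::clamp<int32_t>(q, 0, 255));
}

The vrequantize_pooling and vrequantize_pooling_with_scale specialisations apply the same idea to whole NEON vectors, widening the 8-bit lanes to float before the single quantize step.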
- */ -#ifndef SRC_CORE_NEON_KERNELS_QUANTIZED_H -#define SRC_CORE_NEON_KERNELS_QUANTIZED_H - -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -template -inline typename std::enable_if::value, int8_t>::type -quantize(float val, const UniformQuantizationInfo &info) -{ - return quantize_qasymm8_signed(val, info); -} - -template -inline typename std::enable_if::value, uint8_t>::type -quantize(float val, const UniformQuantizationInfo &info) -{ - return quantize_qasymm8(val, info); -} - -template -inline T vcvtq_q32_f32(float32x4_t values); - -template <> -inline uint32x4_t vcvtq_q32_f32(float32x4_t values) -{ - return vcvtq_u32_f32(values); -} - -template <> -inline int32x4_t vcvtq_q32_f32(float32x4_t values) -{ - return vcvtq_s32_f32(values); -} - -template -inline float32x4_t vcvtq_f32_q32(T values); - -template <> -inline float32x4_t vcvtq_f32_q32(uint32x4_t values) -{ - return vcvtq_f32_u32(values); -} - -template <> -inline float32x4_t vcvtq_f32_q32(int32x4_t values) -{ - return vcvtq_f32_s32(values); -} - -template -inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset); - -template <> -inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset) -{ - const float new_scale = quant_rescale / scale_pooling; - return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset)); -} - -template <> -inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset) -{ - const float new_scale = quant_rescale / scale_pooling; - return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset)); -} - -template -inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInfo &requant_qinfo); - -template <> -inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo) -{ - const float32x4x4_t acc = - { - { - vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))), - } - }; - return vquantize(acc, requant_qinfo); -} - -template <> -inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo) -{ - const float32x4x4_t acc = - { - { - vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))), - vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))), - } - }; - return vquantize_signed(acc, requant_qinfo); -} - -template -inline T vrequantize_pooling(T &vec, const UniformQuantizationInfo &requant_qinfo); - -template <> -inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo) -{ - const float32x4x2_t acc = - { - { - vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))), - } - }; - return vquantize(acc, requant_qinfo); -} - -template <> -inline int8x8_t 
vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo) -{ - const float32x4x2_t acc = - { - { - vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))), - } - }; - return vquantize_signed(acc, requant_qinfo); -} - -inline float calculate_avg_scale(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, - const int pad_x, const int pad_y, const int stride_x, const int stride_y) -{ - const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - int start_x = id[idx_width] * stride_x - pad_x; - int start_y = id[idx_height] * stride_y - pad_y; - - const int end_x = std::min(start_x + pool_size_x, upper_bound_w); - const int end_y = std::min(start_y + pool_size_y, upper_bound_h); - if(exclude_padding) - { - start_x = std::max(0, start_x); - start_y = std::max(0, start_y); - } - return 1.f / ((end_y - start_y) * (end_x - start_x)); -} - -template -void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - ARM_COMPUTE_UNUSED(dst1); - - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 16; - const int window_half_step_x = window_step_x / 2; - - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, window_src); - Iterator out(dst0, window_out); - - using q8x8_t = typename wrapper::traits::neon_vector::type; - using q8x16_t = typename wrapper::traits::neon_vector::type; - using q16_t = typename wrapper::traits::promote_t; - using q16x8_t = typename wrapper::traits::neon_vector::type; - using q32_t = typename wrapper::traits::promote_t; - using q32x4_t = typename wrapper::traits::neon_vector::type; - - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); - - const float32x4_t half_scale_v = vdupq_n_f32(0.5f); - const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); - - const float quant_rescale = dst_qinfo.scale / src_qinfo.scale; - // "new_offset" doesn't have to consider the "half_scale_v" in its computation - // With a requantization performed in a single step there won't be uncertainties introduced - const int32_t new_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / quant_rescale); - - const float requant_scale = dst_qinfo.scale / src_qinfo.scale; - const int32_t requant_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); - const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = id.y() * pool_stride_x; - const int idx_height = id.z() * pool_stride_y; - const int pool_limit_y = pool_pad_top - idx_height; - const int pool_limit_x = pool_pad_left - idx_width; - - const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); - const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); - const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); - const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); - - int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) - { - if(pool_info.pool_type != PoolingType::MAX) - { - q32x4_t vres1 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres2 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres3 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres4 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - - // Calculate scale - const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - - // Perform pooling - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const q8x16_t data = wrapper::vloadq(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - - const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data)); - const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data)); - vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16))); - vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16))); - vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16))); - vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16))); - } - } - - if(src_qinfo != dst_qinfo) - { - const float32x4x4_t vres = - { - { - vcvtq_f32_q32(vres1), - vcvtq_f32_q32(vres2), - vcvtq_f32_q32(vres3), - vcvtq_f32_q32(vres4), - } - }; - const auto requantized_dst = vrequantize_pooling_with_scale(vres, quant_rescale, scale, new_offset); - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst)); - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst)); - } - else - { - const 
float32x4_t scale_v = vdupq_n_f32(scale); - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - vres1 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); - vres2 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); - vres3 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); - vres4 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); - - const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); - const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, res1); - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, res2); - } - } - else - { - q8x16_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_128_tag{}); - - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const q8x16_t data = wrapper::vloadq(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - vres = wrapper::vmax(vres, data); - } - } - - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling(wrapper::vgetlow(vres), wrapper::vgethigh(vres), - requant_qinfo) : - vres); - } - } - - if(pool_info.pool_type == PoolingType::MAX) - { - for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x) - { - q8x8_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_64_tag{}); - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const q8x8_t data = wrapper::vload(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - vres = wrapper::vmax(vres, data); - } - } - - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, - (src_qinfo != dst_qinfo) ? 
vrequantize_pooling(vres, requant_qinfo) : vres); - } - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - if(pool_info.pool_type != PoolingType::MAX) - { - q32_t res = static_cast(0.f); - - // Calculate scale - const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - - // Perform pooling - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const T data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - res += data; - } - } - - if(src_qinfo != dst_qinfo) - { - const float res_f = static_cast(res); - const float new_scale = quant_rescale / scale; - const auto requantized_dst = quantize(res_f, UniformQuantizationInfo(new_scale, new_offset)); - - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = requantized_dst; - } - else - { - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - res = static_cast(0.5f + static_cast(res) * scale); - - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - } - else - { - T res = std::numeric_limits::min(); - - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const T data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().z())) + x_off); - res = std::max(res, data); - } - } - - // Store result - if(src_qinfo != dst_qinfo) - { - const float res_f = static_cast(res); - *(reinterpret_cast(out.ptr()) + x_off) = quantize(res_f, requant_qinfo); - } - else - { - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - } - } - - }, - in, out); -} - -#if defined(ENABLE_NCHW_KERNELS) -template -inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step, - const int pool_size, const int upper_bound_w, const int upper_bound_h, - const int pad_x, const int pad_y, const int stride_x, const int stride_y) -{ - int start_x = (id.x() + id_offset) * stride_x - pad_x; - int start_y = id.y() * stride_y - pad_y; - const int end_y = std::min(start_y + pool_size, upper_bound_h); - if(exclude_padding) - { - start_y = std::max(0, start_y); - } - - std::array elems = - { - { - wrapper::vgetlane(v, 0), - wrapper::vgetlane(v, 1), - wrapper::vgetlane(v, 2), - wrapper::vgetlane(v, 3), - wrapper::vgetlane(v, 4), - wrapper::vgetlane(v, 5), - wrapper::vgetlane(v, 6), - wrapper::vgetlane(v, 7), - } - }; - - for(auto &el : elems) - { - int c_start_x = start_x; - const int end_x = std::min(c_start_x + pool_size, upper_bound_w); - if(exclude_padding) - { - c_start_x = std::max(0, c_start_x); - } - float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x)); - el *= scale; - start_x += step * stride_x; - } - - v = wrapper::vsetlane(elems[0], v, 0); - v = wrapper::vsetlane(elems[1], v, 1); - v = wrapper::vsetlane(elems[2], v, 2); - v = wrapper::vsetlane(elems[3], v, 3); - v = wrapper::vsetlane(elems[4], v, 4); - v = wrapper::vsetlane(elems[5], v, 5); - v = wrapper::vsetlane(elems[6], v, 6); - v = wrapper::vsetlane(elems[7], v, 7); -} - -template -void pooling2_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, 
const Window &window_src, const Window &window) -{ - ARM_COMPUTE_UNUSED(dst1); - Iterator in(src, window_src); - Iterator out(dst0, window); - - /** SIMD vector types */ - using q8x8_t = typename wrapper::traits::neon_vector::type; - using q8x16_t = typename wrapper::traits::neon_vector::type; - using q8x8x2_t = typename std::conditional::value, uint8x8x2_t, int8x8x2_t>::type; - using q16_t = typename wrapper::traits::promote_t; - using q16x4_t = typename wrapper::traits::neon_vector::type; - using q16x8_t = typename wrapper::traits::neon_vector::type; - using q16x8x2_t = typename wrapper::traits::neon_vector::type; - - constexpr int pool_size = 2; - int pool_stride_x = 0; - int pool_stride_y = 0; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - - const T *const src_top_ptr = reinterpret_cast(src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top)))); - const T *const src_bottom_ptr = reinterpret_cast(src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1))); - - const int scale_step_x = (pool_stride_x == 1) ? 2 : 1; - - const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); - const bool have_different_qinfo = src_qinfo != dst_qinfo; - - const float requant_scale = dst_qinfo.scale / src_qinfo.scale; - const int32_t requant_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); - const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto top_data = wrapper::vloadq(src_top_ptr + in.offset()); - const auto bottom_data = wrapper::vloadq(src_bottom_ptr + in.offset()); - q8x8_t lower_res = {}; - q8x8_t upper_res = {}; - - if(pool_info.pool_type != PoolingType::MAX) - { - const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } }; - const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } }; - - // Add rows - const q16x8x2_t vrsum = - { - { - wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), - wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), - } - }; - - // Pair-wise add row data - const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0])); - const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1])); - - q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2); - - // Scale lower result - scale_vector_q16x8(pool_info.exclude_padding, res_lower, id, 0, scale_step_x, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - lower_res = wrapper::vmovn(res_lower); - - // Compute upper result for stride_x == 1 - if(pool_stride_x == 1) - { - // Shifted row sum - 
const q16x8x2_t vrsum_shifted = - { - { - wrapper::vext_1(vrsum.val[0], vrsum.val[1]), - wrapper::vext_1(vrsum.val[1], vrsum.val[1]) - } - }; - - // Pair-wise add shifted row - q16x8_t res_upper = wrapper::vcombine( - wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])), - wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1]))); - - // Scale upper result - scale_vector_q16x8(pool_info.exclude_padding, res_upper, id, 1, 2, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - upper_res = wrapper::vmovn(res_upper); - } - } - else - { - const q8x16_t max_data = wrapper::vmax(top_data, bottom_data); - lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data)); - if(pool_stride_x == 1) - { - const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data); - upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted)); - } - } - - if(have_different_qinfo) - { - const auto requantized_dst = vrequantize_pooling(lower_res, upper_res, requant_qinfo); - lower_res = wrapper::vgetlow(requantized_dst); - upper_res = wrapper::vgethigh(requantized_dst); - } - - // Store result - if(pool_stride_x == 1) - { - const q8x8x2_t res = { { lower_res, upper_res } }; - wrapper::vstore(reinterpret_cast(out.ptr()), res); - } - else - { - wrapper::vstore(reinterpret_cast(out.ptr()), lower_res); - } - }, - in, out); -} - -template -void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - ARM_COMPUTE_UNUSED(dst1); - Iterator in(src, window_src); - Iterator out(dst0, window); - - /** SIMD vector types */ - using q8x8_t = typename wrapper::traits::neon_vector::type; - using q8x16_t = typename wrapper::traits::neon_vector::type; - using q8x8x2_t = typename std::conditional::value, uint8x8x2_t, int8x8x2_t>::type; - using q16_t = typename wrapper::traits::promote_t; - using q16x8_t = typename wrapper::traits::neon_vector::type; - using q16x8x2_t = typename wrapper::traits::neon_vector::type; - - constexpr int pool_size = 3; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); - - const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform(); - - const float requant_scale = dst_qinfo.scale / src_qinfo.scale; - const int32_t requant_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); - const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - - const T *const src_top_ptr = reinterpret_cast(src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top)))); - const T *const src_middle_ptr = reinterpret_cast(src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1))); - const T *const src_bottom_ptr = reinterpret_cast(src->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 2))); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto top_data = wrapper::vloadq(src_top_ptr + in.offset()); - const auto middle_data = wrapper::vloadq(src_middle_ptr + in.offset()); - const auto bottom_data = wrapper::vloadq(src_bottom_ptr + in.offset()); - q8x8_t fres = {}; - q8x16_t fqres = {}; - - if(pool_info.pool_type == PoolingType::AVG) - { - // Convert data to u16 - const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } }; - const q16x8x2_t middle_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data)) } }; - const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } }; - - // Calculate row sums - const q16x8x2_t vrsum = - { - { - wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]), - wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]), - } - }; - const q16x8x2_t vrsum_shifted_1 = - { - { - wrapper::vext_1(vrsum.val[0], vrsum.val[1]), - wrapper::vext_1(vrsum.val[1], vrsum.val[1]) - } - }; - const q16x8x2_t vrsum_shifted_2 = - { - { - wrapper::vext_2(vrsum.val[0], vrsum.val[1]), - wrapper::vext_2(vrsum.val[1], vrsum.val[1]) - } - }; - // Calculate final sum - q16x8x2_t final_sum = - { - { - wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]), - wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]), - } - }; - if(pool_stride_x == 2) - { - q16x8_t res = - { - wrapper::vgetlane(final_sum.val[0], 0), - wrapper::vgetlane(final_sum.val[0], 2), - wrapper::vgetlane(final_sum.val[0], 4), - wrapper::vgetlane(final_sum.val[0], 6), - wrapper::vgetlane(final_sum.val[1], 0), - wrapper::vgetlane(final_sum.val[1], 2), - wrapper::vgetlane(final_sum.val[1], 4), - wrapper::vgetlane(final_sum.val[1], 6), - }; - - scale_vector_q16x8(pool_info.exclude_padding, res, id, 0, 1, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - fres = wrapper::vmovn(res); - } - else - { - // Scale lower result - scale_vector_q16x8(pool_info.exclude_padding, final_sum.val[0], id, 0, 1, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - // Scale lower result - scale_vector_q16x8(pool_info.exclude_padding, final_sum.val[1], id, 8, 1, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, 
pool_stride_y); - fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1])); - } - } - else - { - const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data); - const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data); - const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data); - const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2); - - if(pool_stride_x == 2) - { - const q8x8x2_t table = { { wrapper::vgetlow(final_max), wrapper::vgethigh(final_max) } }; - static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 }; - fres = wrapper::vtbl(table, lookup_val); - } - else - { - fqres = final_max; - } - } - - // Store result - if(pool_stride_x == 1) - { - if(src_qinfo != dst_qinfo) - { - fqres = vrequantize_pooling(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo); - } - wrapper::vstore(reinterpret_cast(out.ptr()), fqres); - } - else - { - if(src_qinfo != dst_qinfo) - { - fres = vrequantize_pooling(fres, requant_qinfo); - } - wrapper::vstore(reinterpret_cast(out.ptr()), fres); - } - }, - in, out); -} - -template -void poolingMxN_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - ARM_COMPUTE_UNUSED(dst1); - Iterator in(src, window_src); - Iterator out(dst0, window); - - /** SIMD vector types */ - using q8x8_t = typename wrapper::traits::neon_vector::type; - using q16_t = typename wrapper::traits::promote_t; - using q16x8_t = typename wrapper::traits::neon_vector::type; - using q32_t = typename wrapper::traits::promote_t; - using q32x4_t = typename wrapper::traits::neon_vector::type; - - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); - - const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform(); - - execute_window_loop(window, [&](const Coordinates & id) - { - T res = std::numeric_limits::min(); - - if(pool_info.pool_type != PoolingType::MAX) - { - q32x4_t vres = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - q32_t sres = 0; - - // Calculate scale - const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - - // Perform pooling - for(int y = 0; y < pool_size_y; ++y) - { - int x = 0; - for(; x <= (pool_size_x - 8); x += 8) - { - const q8x8_t data = wrapper::vload(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().y()))); - - const q16x8_t data_q16 = wrapper::vmovl(data); - vres = wrapper::vadd(vres, wrapper::vaddl(wrapper::vgethigh(data_q16), wrapper::vgetlow(data_q16))); - } - - // Leftover for loop - for(; x < pool_size_x; ++x) - { - T data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().y()))); - sres += data; - } - } - - // Reduction - const auto tmp = wrapper::vpadd(wrapper::vgethigh(vres), wrapper::vgetlow(vres)); - sres += wrapper::vgetlane(tmp, 0) + wrapper::vgetlane(tmp, 1); - - // Divide by scale - res = static_cast(support::cpp11::round(sres * scale)); - } - else - { - q8x8_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_64_tag{}); - - for(int y = 0; y < pool_size_y; ++y) - { - int x = 0; - for(; x <= (pool_size_x - 8); x += 8) - { - const q8x8_t data = wrapper::vload(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().y()))); - vres = wrapper::vmax(vres, data); - } - // Leftover for loop - for(; x < pool_size_x; ++x) - { - const T data = *(reinterpret_cast(in.ptr() + (x - pool_pad_left) * static_cast(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast - (src->info()->strides_in_bytes().y()))); - res = std::max(res, data); - } - } - - // Reduce max - vres = wrapper::vpmax(vres, vres); - vres = wrapper::vpmax(vres, vres); - vres = wrapper::vpmax(vres, vres); - - // Get max value - res = std::max(res, wrapper::vgetlane(vres, 0)); - } - // Store result - res = (src_qinfo != dst_qinfo) ? Qasymm8QuantizationHelper::quantize(Qasymm8QuantizationHelper::dequantize(res, src_qinfo), dst_qinfo) : res; - *(reinterpret_cast(out.ptr())) = res; - }, - in, out); -} -#endif /* defined(ENABLE_NCHW_KERNELS) */ -} // namespace cpu -} // namespace arm_compute - -#endif // SRC_CORE_NEON_KERNELS_QUANTIZED_H diff --git a/src/core/cpu/kernels/scale/neon/fp16.cpp b/src/core/cpu/kernels/scale/neon/fp16.cpp deleted file mode 100644 index 0ad66cab1c..0000000000 --- a/src/core/cpu/kernels/scale/neon/fp16.cpp +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
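The scale kernels that follow (fp16 here, u8 in integer.cpp further down) select the nearest-neighbour source sample from a precomputed width offset plus a height index derived from the resize ratio. Assuming calculate_resize_ratio returns input_size / output_size (with the usual minus-one adjustment when align_corners is set), the index selection reduces to the sketch below; the helper names are not from the patch.

#include <cmath>
#include <cstdint>

// Illustrative sketch of the row-index selection used by the *_neon_scale_nearest kernels.
inline float resize_ratio(int in_size, int out_size, bool align_corners)
{
    return (align_corners && out_size > 1) ? static_cast<float>(in_size - 1) / static_cast<float>(out_size - 1)
                                           : static_cast<float>(in_size) / static_cast<float>(out_size);
}

inline int32_t nearest_src_index(int32_t dst_index, float ratio, float sampling_offset, bool align_corners)
{
    const float pos = (static_cast<float>(dst_index) + sampling_offset) * ratio;
    // align_corners rounds half away from zero; otherwise the position is floored.
    return align_corners ? static_cast<int32_t>(std::lround(pos)) : static_cast<int32_t>(std::floor(pos));
}

Because the layout is NHWC, the selected row is then copied channel by channel with full vector loads plus a scalar tail, which is all the main loop in fp16_neon_scale_nearest does.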
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/utils/ScaleUtils.h" -#include "support/Rounding.h" - -#include -#include -#include - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) - -namespace arm_compute -{ -namespace -{ -void fp16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) -{ - const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; - const size_t in_stride_wc = in_stride_w * in_stride_c; - const size_t in_dim_h = src->info()->dimension(2); - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const int window_step_x = 8; - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator out(dst, win); - - const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - int32_t x = window_start_x; - const float16_t *in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - - for(; x <= window_end_x - window_step_x; x += window_step_x) - { - wrapper::vstore(reinterpret_cast(out.ptr()) + x, - wrapper::vloadq(in_ptr + offset + offset_row + x)); - } - for(; x < window_end_x; ++x) - { - *(reinterpret_cast(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); - } - }, - out); -} - -void fp16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - - Iterator out(dst, window); - const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const int in_dim_w = src->info()->dimension(1); - const int in_dim_h = src->info()->dimension(2); - const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); - - // Don't increment in Y and Z direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Iterator in(src, win_in); - - if(border_mode == BorderMode::CONSTANT) - { - using ConstType = typename std::conditional::value, half, float16_t>::type; - - const float16_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const float16_t *in_ptr = reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? 
*(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - - auto clamped_w = utility::clamp(offset, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); - - const auto a00 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); - const auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); - const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); - const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} -} -namespace cpu -{ -void fp16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - if(policy == InterpolationPolicy::BILINEAR) - { - fp16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); - } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - fp16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); - } -} -} // namespace cpu -} // namespace arm_compute - -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ \ No newline at end of file diff --git a/src/core/cpu/kernels/scale/neon/integer.cpp b/src/core/cpu/kernels/scale/neon/integer.cpp deleted file mode 100644 index a2359aac94..0000000000 --- a/src/core/cpu/kernels/scale/neon/integer.cpp +++ /dev/null @@ -1,293 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
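For the bilinear paths, delta_bilinear blends the four neighbouring samples using corner weights built from the fractional offsets dx and dy; the CONSTANT border mode substitutes constant_border_value for any corner that falls outside the source, while REPLICATE clamps the coordinates instead. A sketch of the standard blend those helpers compute (exact rounding of the final cast differs per data type):

// Illustrative: the conventional bilinear weighting of the four corner samples.
inline float delta_bilinear_sketch(float a00, float a01, float a10, float a11, float dx, float dy)
{
    const float w00 = (1.f - dx) * (1.f - dy); // top-left
    const float w01 = dx * (1.f - dy);         // top-right
    const float w10 = (1.f - dx) * dy;         // bottom-left
    const float w11 = dx * dy;                 // bottom-right
    return a00 * w00 + a01 * w01 + a10 * w10 + a11 * w11;
}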
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/utils/ScaleUtils.h" -#include "support/Rounding.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace -{ -void u8_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) -{ - const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; - const size_t in_stride_wc = in_stride_w * in_stride_c; - const size_t in_dim_h = src->info()->dimension(2); - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const int window_step_x = 16; - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator out(dst, win); - - const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - int32_t x = window_start_x; - const uint8_t *in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - - for(; x <= window_end_x - window_step_x; x += window_step_x) - { - wrapper::vstore(reinterpret_cast(out.ptr()) + x, - wrapper::vloadq(in_ptr + offset + offset_row + x)); - } - for(; x < window_end_x; ++x) - { - *(reinterpret_cast(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); - } - }, - out); -} - -void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - - Iterator out(dst, window); - const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const int in_dim_w = src->info()->dimension(1); - const int in_dim_h = src->info()->dimension(2); - const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); - - // Don't increment in Y and Z direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Iterator in(src, win_in); - - if(border_mode == BorderMode::CONSTANT) - { - const uint8_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const uint8_t *in_ptr = reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? 
*(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - - auto clamped_w = utility::clamp(offset, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); - - const auto a00 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); - const auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); - const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); - const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} - -void s16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) -{ - const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; - const size_t in_stride_wc = in_stride_w * in_stride_c; - const size_t in_dim_h = src->info()->dimension(2); - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const int window_step_x = 8; - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator out(dst, win); - - const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - int32_t x = window_start_x; - const int16_t *in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - - for(; x <= window_end_x - window_step_x; x += window_step_x) - { - wrapper::vstore(reinterpret_cast(out.ptr()) + x, - wrapper::vloadq(in_ptr + offset + offset_row + x)); - } - for(; x < window_end_x; ++x) - { - *(reinterpret_cast(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); - } - }, - out); -} - -void s16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - - Iterator out(dst, window); - const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const int in_dim_w = src->info()->dimension(1); - const int in_dim_h = src->info()->dimension(2); - const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); - - // Don't increment in Y and Z direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Iterator in(src, win_in); - - if(border_mode == BorderMode::CONSTANT) - { - const int16_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const int16_t *in_ptr = reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? 
*(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - - auto clamped_w = utility::clamp(offset, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); - - const auto a00 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); - const auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); - const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); - const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} -} -namespace cpu -{ -void u8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - if(policy == InterpolationPolicy::BILINEAR) - { - u8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); - } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - u8_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); - } -} - -void s16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - if(policy == InterpolationPolicy::BILINEAR) - { - s16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); - } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - s16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); - } -} -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/scale/neon/list.h b/src/core/cpu/kernels/scale/neon/list.h deleted file mode 100644 index c91242f5b2..0000000000 --- a/src/core/cpu/kernels/scale/neon/list.h +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
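Both bilinear variants just removed (u8 and s16) differ only in how out-of-range neighbours are produced: BorderMode::CONSTANT substitutes the border value for any of the four taps that falls outside the image, while BorderMode::REPLICATE clamps the coordinates into range first. A hedged, self-contained sketch of that per-pixel step follows; delta_blend is a plain weighted blend and is only assumed to match what scale_helpers::delta_bilinear computes, and the names and signature are illustrative rather than ACL API:

    #include <algorithm>

    // Standard bilinear blend of the four neighbouring taps.
    template <typename T>
    float delta_blend(T a00, T a01, T a10, T a11, float dx, float dy)
    {
        return a00 * (1.f - dx) * (1.f - dy) + a01 * dx * (1.f - dy)
             + a10 * (1.f - dx) * dy + a11 * dx * dy;
    }

    // One output sample with the two border modes used by the deleted kernels.
    // Strides are in elements; (x0, y0) is the top-left tap of the 2x2 neighbourhood.
    template <typename T>
    T sample_bilinear(const T *plane, int x0, int y0, int w, int h,
                      int stride_x, int stride_y, float dx, float dy,
                      bool replicate, T border_value)
    {
        auto tap = [&](int x, int y) -> T
        {
            if (replicate)
            {
                // BorderMode::REPLICATE: clamp the coordinate into the valid range.
                x = std::min(std::max(x, 0), w - 1);
                y = std::min(std::max(y, 0), h - 1);
                return plane[x * stride_x + y * stride_y];
            }
            // BorderMode::CONSTANT: out-of-range taps read the constant border value.
            return (x >= 0 && x < w && y >= 0 && y < h) ? plane[x * stride_x + y * stride_y]
                                                        : border_value;
        };
        return static_cast<T>(delta_blend(tap(x0, y0), tap(x0 + 1, y0),
                                          tap(x0, y0 + 1), tap(x0 + 1, y0 + 1), dx, dy));
    }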
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_NEON_KERNELS_SCALE_LIST_H -#define SRC_CORE_NEON_KERNELS_SCALE_LIST_H - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/utils/ScaleUtils.h" -#include "support/Rounding.h" - -namespace arm_compute -{ -namespace cpu -{ -#define DECLARE_SCALE_KERNEL(func_name) \ - void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \ - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, \ - bool align_corners, const Window &window) - -DECLARE_SCALE_KERNEL(qasymm8_neon_scale); -DECLARE_SCALE_KERNEL(qasymm8_signed_neon_scale); - -#undef DECLARE_SCALE_KERNEL - -template -void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, float sampling_offset, - bool align_corners, const Window &window) -{ - const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; - const size_t in_stride_wc = in_stride_w * in_stride_c; - const size_t in_dim_h = src->info()->dimension(2); - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const int window_step_x = 16 / sizeof(T); - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator out(dst, win); - - const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - int32_t x = window_start_x; - const T *in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - - for(; x <= window_end_x - window_step_x; x += window_step_x) - { - wrapper::vstore(reinterpret_cast(out.ptr()) + x, - wrapper::vloadq(in_ptr + offset + offset_row + x)); - } - for(; x < window_end_x; ++x) - { - *(reinterpret_cast(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); - } - }, - out); -} - -template -void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - - Iterator out(dst, window); - const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const int in_dim_w = src->info()->dimension(1); - const int in_dim_h = src->info()->dimension(2); - const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); - - // Don't increment in Y and Z direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Iterator in(src, win_in); - - if(border_mode == BorderMode::CONSTANT) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - using ConstType = typename std::conditional::value, half, T>::type; -#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - using ConstType = T; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - const T const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const T *in_ptr = reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? 
*(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - - auto clamped_w = utility::clamp(offset, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); - - const auto a00 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); - const auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); - const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); - const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} - -template -void common_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - if(policy == InterpolationPolicy::BILINEAR) - { - bilinear_neon_scale(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); - } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - nearest_neon_scale(src, dst, offsets, sampling_offset, align_corners, window); - } -} -} // namespace cpu -} // namespace arm_compute - -#endif /* SRC_CORE_NEON_KERNELS_SCALE_LIST_H */ diff --git a/src/core/cpu/kernels/scale/neon/qasymm8.cpp b/src/core/cpu/kernels/scale/neon/qasymm8.cpp deleted file mode 100644 index 90302ce889..0000000000 --- a/src/core/cpu/kernels/scale/neon/qasymm8.cpp +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
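Every kernel in these files derives its vertical source index from scale_utils::calculate_resize_ratio plus a sampling offset. A small sketch of the assumed arithmetic (the align_corners branch maps the first and last samples of input and output onto each other; the nearest-neighbour path rounds half away from zero when align_corners is set, while the bilinear path re-subtracts the sampling offset before flooring, as in the code above):

    #include <cmath>
    #include <cstddef>

    // Assumed behaviour of scale_utils::calculate_resize_ratio (a sketch, not the ACL source).
    inline float resize_ratio(std::size_t in, std::size_t out, bool align_corners)
    {
        return align_corners ? float(in - 1) / float(out - 1) : float(in) / float(out);
    }

    // Vertical source row used by the nearest-neighbour kernels.
    inline int nearest_row(int out_y, float hr, float sampling_offset, bool align_corners)
    {
        const float in_y = (out_y + sampling_offset) * hr;
        return align_corners ? int(std::lround(in_y))  // round half away from zero
                             : int(std::floor(in_y));
    }

    // Vertical source row used by the bilinear kernels (top tap of the 2x2 neighbourhood).
    inline int bilinear_row(int out_y, float hr, float sampling_offset)
    {
        return int(std::floor((out_y + sampling_offset) * hr - sampling_offset));
    }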
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/scale/neon/list.h" - -namespace arm_compute -{ -namespace -{ -void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Data layout is NHWC - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - Window win_off; - win_off.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_off.set(Window::DimY, Window::Dimension(0, 0, 0)); - - // Don't increment in X and Y direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(1, Window::Dimension(0, 0, 0)); - win_in.set(2, Window::Dimension(0, 0, 0)); - - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) - { - win_off.set(d, Window::Dimension(0, 0, 0)); - } - - Iterator in(src, win_in); - Iterator out(dst, window); - - const int32_t in_dim_w = src->info()->dimension(1); - const int32_t in_dim_h = src->info()->dimension(2); - const int32_t stride_w = src->info()->strides_in_bytes()[1]; - const int32_t stride_h = src->info()->strides_in_bytes()[2]; - - const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - if(border_mode == BorderMode::CONSTANT) - { - const uint8_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t index_h = std::floor((id[2] + sampling_offset) * hr - sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[1], id[2])))); - const auto pixel_row_ptr = reinterpret_cast(in.ptr()); - - const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) : - const_border_value; - const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) : - const_border_value; - const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ? 
- (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const int index_h = std::floor((id[2] + sampling_offset) * hr - sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[1], id[2])))); - const auto pixel_row_ptr = reinterpret_cast(in.ptr()); - - auto clamped_w = utility::clamp(index_w, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(index_w + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(index_h, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(index_h + 1, 0, in_dim_h - 1); - - const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h); - const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h); - const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h); - const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h); - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} -} -namespace cpu -{ -void qasymm8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - if(policy == InterpolationPolicy::BILINEAR) - { - qasymm8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); - } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - nearest_neon_scale(src, dst, offsets, sampling_offset, align_corners, window); - } -} -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/scale/neon/qasymm8_signed.cpp b/src/core/cpu/kernels/scale/neon/qasymm8_signed.cpp deleted file mode 100644 index 07d6c6ef03..0000000000 --- a/src/core/cpu/kernels/scale/neon/qasymm8_signed.cpp +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
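The QASYMM8 kernel above (and the QASYMM8_SIGNED one below) wraps the same bilinear blend in a dequantize/requantize pair driven by the input and output UniformQuantizationInfo. A simplified sketch, assuming an affine scheme real = (q - offset) * scale and plain round-to-nearest requantization; ACL's exact rounding policy and Qasymm8QuantizationHelper API are not reproduced here, and the signed variant is identical apart from int8_t storage and a [-128, 127] clamp:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    struct QuantInfo { float scale; int32_t offset; };   // stand-in for UniformQuantizationInfo

    inline float dequantize(uint8_t v, QuantInfo qi) { return (int32_t(v) - qi.offset) * qi.scale; }

    inline uint8_t quantize(float v, QuantInfo qi)
    {
        const int32_t q = int32_t(std::lround(v / qi.scale)) + qi.offset;
        return uint8_t(std::min(std::max(q, 0), 255));
    }

    // Dequantize the four taps, blend in float, requantize with the output parameters.
    inline uint8_t bilinear_q8(uint8_t a00, uint8_t a01, uint8_t a10, uint8_t a11,
                               float dx, float dy, QuantInfo in_qi, QuantInfo out_qi)
    {
        const float f00 = dequantize(a00, in_qi), f01 = dequantize(a01, in_qi);
        const float f10 = dequantize(a10, in_qi), f11 = dequantize(a11, in_qi);
        const float blended = f00 * (1 - dx) * (1 - dy) + f01 * dx * (1 - dy)
                            + f10 * (1 - dx) * dy + f11 * dx * dy;
        return quantize(blended, out_qi);
    }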
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/scale/neon/list.h" - -namespace arm_compute -{ -namespace -{ -void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Data layout is NHWC - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - Window win_off; - win_off.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_off.set(Window::DimY, Window::Dimension(0, 0, 0)); - - // Don't increment in X and Y direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(1, Window::Dimension(0, 0, 0)); - win_in.set(2, Window::Dimension(0, 0, 0)); - - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) - { - win_off.set(d, Window::Dimension(0, 0, 0)); - } - - Iterator in(src, win_in); - Iterator out(dst, window); - - const int32_t in_dim_w = src->info()->dimension(1); - const int32_t in_dim_h = src->info()->dimension(2); - const int32_t stride_w = src->info()->strides_in_bytes()[1]; - const int32_t stride_h = src->info()->strides_in_bytes()[2]; - - const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - if(border_mode == BorderMode::CONSTANT) - { - const int8_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t index_h = std::floor((id[2] + sampling_offset) * hr - sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[1], id[2])))); - const auto pixel_row_ptr = reinterpret_cast(in.ptr()); - - const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? 
- (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) : - const_border_value; - const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) : - const_border_value; - const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const int index_h = std::floor((id[2] + sampling_offset) * hr - sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[1], id[2])))); - const auto pixel_row_ptr = reinterpret_cast(in.ptr()); - - auto clamped_w = utility::clamp(index_w, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(index_w + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(index_h, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(index_h + 1, 0, in_dim_h - 1); - - const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h); - const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h); - const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h); - const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h); - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} -} -namespace cpu -{ -void qasymm8_signed_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - if(policy == InterpolationPolicy::BILINEAR) - { - qasymm8_signed_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); - } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - nearest_neon_scale(src, dst, offsets, sampling_offset, align_corners, window); - } -} -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff 
--git a/src/core/cpu/kernels/scale/sve/fp16.cpp b/src/core/cpu/kernels/scale/sve/fp16.cpp deleted file mode 100644 index 76e7735b8a..0000000000 --- a/src/core/cpu/kernels/scale/sve/fp16.cpp +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#if defined(ARM_COMPUTE_ENABLE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/utils/ScaleUtils.h" -#include "support/Rounding.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace -{ -void fp16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) -{ - const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; - const size_t in_stride_wc = in_stride_w * in_stride_c; - const size_t in_dim_h = src->info()->dimension(2); - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator out(dst, win); - - const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do - { - // Store results - svst1_f16(pg, out_ptr + x, svld1_f16(pg, in_ptr + offset + offset_row + x)); - - x += svcntw(); - pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(svptrue_b16(), pg)); - }, - out); -} - -void fp16_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - - Iterator out(dst, window); - const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const int in_dim_w = src->info()->dimension(1); - const int in_dim_h = src->info()->dimension(2); - const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); - - // Don't increment in Y and Z direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Iterator in(src, win_in); - - if(border_mode == BorderMode::CONSTANT) - { - using ConstType = typename std::conditional::value, half, float16_t>::type; - - const float16_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const float16_t *in_ptr = reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? 
*(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - - auto clamped_w = utility::clamp(offset, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); - - const auto a00 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); - const auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); - const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); - const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} -} -namespace cpu -{ -void fp16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - if(policy == InterpolationPolicy::BILINEAR) - { - fp16_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); - } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - fp16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); - } -} -} // namespace cpu -} // namespace arm_compute - -#endif // ARM_COMPUTE_ENABLE_SVE \ No newline at end of file diff --git a/src/core/cpu/kernels/scale/sve/fp32.cpp b/src/core/cpu/kernels/scale/sve/fp32.cpp deleted file mode 100644 index 030e109cdf..0000000000 --- a/src/core/cpu/kernels/scale/sve/fp32.cpp +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
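The SVE nearest-neighbour kernels replace the NEON main-loop-plus-tail shape with a single predicated loop: svwhilelt builds a predicate covering only the elements that remain, so the final partial vector needs no scalar cleanup. A minimal sketch for 32-bit data, assuming an SVE-enabled toolchain (e.g. -march=armv8.2-a+sve); 16-bit and 8-bit variants would pair svwhilelt_b16 with svcnth and svwhilelt_b8 with svcntb so the step matches the predicate's lane width:

    #include <arm_sve.h>
    #include <cstdint>

    // Predicated row copy: the predicate masks off lanes past the end of the row.
    void copy_row_sve_f32(const float *src, float *dst, int64_t n)
    {
        int64_t  x  = 0;
        svbool_t pg = svwhilelt_b32(x, n);
        do
        {
            svst1_f32(pg, dst + x, svld1_f32(pg, src + x));
            x += svcntw();                 // advance by the number of 32-bit lanes
            pg = svwhilelt_b32(x, n);
        } while (svptest_any(svptrue_b32(), pg));
    }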
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(ARM_COMPUTE_ENABLE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/utils/ScaleUtils.h" -#include "support/Rounding.h" - -#include -#include - -#include - -namespace arm_compute -{ -namespace -{ -void fp32_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) -{ - const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; - const size_t in_stride_wc = in_stride_w * in_stride_c; - const size_t in_dim_h = src->info()->dimension(2); - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator out(dst, win); - - const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b32(x, window_end_x); - do - { - // Store results - svst1_f32(pg, out_ptr + x, svld1_f32(pg, in_ptr + offset + offset_row + x)); - - x += svcntw(); - pg = svwhilelt_b32(x, window_end_x); - } - while(svptest_any(svptrue_b32(), pg)); - }, - out); -} - -void fp32_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - - Iterator out(dst, window); - const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const int in_dim_w = src->info()->dimension(1); - const int in_dim_h = src->info()->dimension(2); - const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); - - // Don't increment in Y and Z direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Iterator in(src, win_in); - - if(border_mode == BorderMode::CONSTANT) - { - const float const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const float *in_ptr = reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? 
*(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - - auto clamped_w = utility::clamp(offset, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); - - const auto a00 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); - const auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); - const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); - const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} -} -namespace cpu -{ -void fp32_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - if(policy == InterpolationPolicy::BILINEAR) - { - fp32_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); - } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - fp32_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); - } -} -} // namespace cpu -} // namespace arm_compute - -#endif // ARM_COMPUTE_ENABLE_SVE \ No newline at end of file diff --git a/src/core/cpu/kernels/scale/sve/integer.cpp b/src/core/cpu/kernels/scale/sve/integer.cpp deleted file mode 100644 index 486c674612..0000000000 --- a/src/core/cpu/kernels/scale/sve/integer.cpp +++ /dev/null @@ -1,300 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
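All of these kernels address the source tensor through element strides that include padding: the channel stride is the padded channel count, the row stride multiplies it by the padded width, and a pixel is reached as c + x * stride_c + y * stride_wc. A small sketch of that addressing with strides expressed in elements; the real code takes the sizes and padding from ITensorInfo, and the struct here is purely illustrative:

    #include <cstddef>

    struct PaddedNHWC
    {
        std::size_t channels, width;        // logical sizes (dimension 0 and 1)
        std::size_t pad_left, pad_right;    // padding around the channel dimension
        std::size_t pad_top, pad_bottom;    // padding around the width dimension

        std::size_t stride_c()  const { return channels + pad_left + pad_right; }
        std::size_t stride_wc() const { return stride_c() * (width + pad_top + pad_bottom); }

        // Element index of channel c at spatial position (x, y) within one batch plane.
        std::size_t at(std::size_t c, std::size_t x, std::size_t y) const
        {
            return c + x * stride_c() + y * stride_wc();
        }
    };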
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(ARM_COMPUTE_ENABLE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/utils/ScaleUtils.h" -#include "support/Rounding.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace -{ -void u8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) -{ - const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; - const size_t in_stride_wc = in_stride_w * in_stride_c; - const size_t in_dim_h = src->info()->dimension(2); - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator out(dst, win); - - const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do - { - // Store results - svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x)); - - x += svcntw(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(svptrue_b8(), pg)); - }, - out); -} - -void u8_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - - Iterator out(dst, window); - const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const int in_dim_w = src->info()->dimension(1); - const int in_dim_h = src->info()->dimension(2); - const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); - - // Don't increment in Y and Z direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Iterator in(src, win_in); - - if(border_mode == BorderMode::CONSTANT) - { - const uint8_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const uint8_t *in_ptr = reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? 
*(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - - auto clamped_w = utility::clamp(offset, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); - - const auto a00 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); - const auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); - const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); - const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} - -void s16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) -{ - const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; - const size_t in_stride_wc = in_stride_w * in_stride_c; - const size_t in_dim_h = src->info()->dimension(2); - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator out(dst, win); - - const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do - { - // Store results - svst1_s16(pg, out_ptr + x, svld1_s16(pg, in_ptr + offset + offset_row + x)); - - x += svcntw(); - pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(svptrue_b16(), pg)); - }, - out); -} - -void s16_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - - Iterator out(dst, window); - const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const int in_dim_w = src->info()->dimension(1); - const int in_dim_h = src->info()->dimension(2); - const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); - - // Don't increment in Y and Z direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Iterator in(src, win_in); - - if(border_mode == BorderMode::CONSTANT) - { - const int16_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const int16_t *in_ptr = reinterpret_cast(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? 
*(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - - auto clamped_w = utility::clamp(offset, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(in_hi, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1); - - const auto a00 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); - const auto a01 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); - const auto a10 = *(reinterpret_cast(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); - const auto a11 = *(reinterpret_cast(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); - - *reinterpret_cast(out.ptr()) = static_cast(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} -} -namespace cpu -{ -void u8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - if(policy == InterpolationPolicy::BILINEAR) - { - u8_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); - } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - u8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); - } -} - -void s16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - if(policy == InterpolationPolicy::BILINEAR) - { - s16_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); - } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - s16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); - } -} -} // namespace cpu -} // namespace arm_compute - -#endif // ARM_COMPUTE_ENABLE_SVE \ No newline at end of file diff --git a/src/core/cpu/kernels/scale/sve/list.h b/src/core/cpu/kernels/scale/sve/list.h deleted file mode 100644 index b9c3a10a78..0000000000 --- a/src/core/cpu/kernels/scale/sve/list.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
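For reference, every bilinear branch in these deleted scale kernels gathers the four neighbouring samples a00, a01, a10 and a11 and blends them with the precomputed fractional offsets dx and dy through scale_helpers::delta_bilinear, casting the result back to the destination element type. A minimal scalar sketch of that blend, assuming the helper implements the standard bilinear weighting (the function below is illustrative, not the library implementation):

// Standard bilinear blend of four neighbours with fractional offsets dx, dy in [0, 1].
// a00 = top-left, a01 = top-right, a10 = bottom-left, a11 = bottom-right sample.
static inline float bilinear_blend(float a00, float a01, float a10, float a11, float dx, float dy)
{
    const float w00 = (1.f - dx) * (1.f - dy); // weight of the top-left sample
    const float w01 = dx * (1.f - dy);         // weight of the top-right sample
    const float w10 = (1.f - dx) * dy;         // weight of the bottom-left sample
    const float w11 = dx * dy;                 // weight of the bottom-right sample
    return a00 * w00 + a01 * w01 + a10 * w10 + a11 * w11;
}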
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_SVE_KERNELS_SCALE_LIST_H -#define SRC_CORE_SVE_KERNELS_SCALE_LIST_H - -namespace arm_compute -{ -namespace cpu -{ -#define DECLARE_SCALE_KERNEL(func_name) \ - void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \ - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, \ - bool align_corners, const Window &window) - -DECLARE_SCALE_KERNEL(fp16_sve_scale); -DECLARE_SCALE_KERNEL(fp32_sve_scale); -DECLARE_SCALE_KERNEL(s16_sve_scale); -DECLARE_SCALE_KERNEL(u8_sve_scale); -DECLARE_SCALE_KERNEL(qasymm8_sve_scale); -DECLARE_SCALE_KERNEL(qasymm8_signed_sve_scale); - -#undef DECLARE_SCALE_KERNEL -} // namespace cpu -} // namespace arm_compute - -#endif /* SRC_CORE_SVE_KERNELS_SCALE_LIST_H */ diff --git a/src/core/cpu/kernels/scale/sve/qasymm8.cpp b/src/core/cpu/kernels/scale/sve/qasymm8.cpp deleted file mode 100644 index c9122ad40b..0000000000 --- a/src/core/cpu/kernels/scale/sve/qasymm8.cpp +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
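The *_sve_scale_nearest variants declared above all map an output coordinate to a source row with the same arithmetic: a resize ratio between the two extents, the sampling offset, and either flooring or half-away-from-zero rounding depending on align_corners. A small sketch of that index computation, assuming scale_utils::calculate_resize_ratio follows the usual align-corners convention (the helper names below are illustrative, not the library API):

#include <cmath>

// Ratio between source and destination extent along one axis. With align_corners the
// first and last samples of both tensors are mapped onto each other.
static inline float resize_ratio(int in_size, int out_size, bool align_corners)
{
    const int offset = (align_corners && out_size > 1) ? 1 : 0;
    return static_cast<float>(in_size - offset) / static_cast<float>(out_size - offset);
}

// Source index chosen by a nearest-neighbour kernel for output index z.
static inline int nearest_index(int z, float ratio, float sampling_offset, bool align_corners)
{
    const float pos = (static_cast<float>(z) + sampling_offset) * ratio;
    return align_corners ? static_cast<int>(std::lround(pos)) // std::lround rounds half away from zero
                         : static_cast<int>(std::floor(pos));
}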
- */ -#if defined(ARM_COMPUTE_ENABLE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/utils/ScaleUtils.h" -#include "support/Rounding.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace -{ -void qasymm8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) -{ - const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; - const size_t in_stride_wc = in_stride_w * in_stride_c; - const size_t in_dim_h = src->info()->dimension(2); - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator out(dst, win); - - const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do - { - // Store results - svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x)); - - x += svcntw(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(svptrue_b8(), pg)); - }, - out); -} - -void qasymm8_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Data layout is NHWC - const int idx_width = 1; - const int idx_height = 2; - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), align_corners); - Window win_off; - win_off.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_off.set(Window::DimY, Window::Dimension(0, 0, 0)); - - // Don't increment in X and Y direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(idx_width, Window::Dimension(0, 0, 0)); - win_in.set(idx_height, Window::Dimension(0, 0, 0)); - - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) - { - win_off.set(d, Window::Dimension(0, 0, 0)); - } - - Iterator in(src, win_in); - 
Iterator out(dst, window); - - const int32_t in_dim_w = src->info()->dimension(idx_width); - const int32_t in_dim_h = src->info()->dimension(idx_height); - const int32_t stride_w = src->info()->strides_in_bytes()[idx_width]; - const int32_t stride_h = src->info()->strides_in_bytes()[idx_height]; - - const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - if(border_mode == BorderMode::CONSTANT) - { - const uint8_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto pixel_row_ptr = reinterpret_cast(in.ptr()); - - const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) : - const_border_value; - const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) : - const_border_value; - const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ? 
- (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const int index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto pixel_row_ptr = reinterpret_cast(in.ptr()); - - auto clamped_w = utility::clamp(index_w, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(index_w + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(index_h, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(index_h + 1, 0, in_dim_h - 1); - - const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h); - const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h); - const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h); - const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h); - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} -} -namespace cpu -{ -void qasymm8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - if(policy == InterpolationPolicy::BILINEAR) - { - qasymm8_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); - } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - qasymm8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); - } -} -} // namespace cpu -} // namespace arm_compute - -#endif // defined(ARM_COMPUTE_ENABLE_SVE) \ No newline at end of file diff --git a/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp b/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp deleted file mode 100644 index 0843e61fd4..0000000000 --- a/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
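The qasymm8 path differs from the u8/s16 kernels only in that the four neighbours are dequantized with the source UniformQuantizationInfo before the blend and the interpolated value is requantized with the destination scale and offset. A compact sketch of that round trip using the usual asymmetric-quantization formulas (the helpers below are illustrative stand-ins for Qasymm8QuantizationHelper, not the library code):

#include <algorithm>
#include <cmath>
#include <cstdint>

struct QuantInfo { float scale; int32_t offset; }; // uniform asymmetric quantization parameters

// real = scale * (quantized - offset)
static inline float dequantize_u8(uint8_t q, QuantInfo qi)
{
    return qi.scale * (static_cast<int32_t>(q) - qi.offset);
}

// quantized = clamp(round(real / scale) + offset, 0, 255)
static inline uint8_t quantize_u8(float v, QuantInfo qi)
{
    const int32_t q = static_cast<int32_t>(std::lround(v / qi.scale)) + qi.offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

// Dequantize the four neighbours, blend in float, requantize with the output parameters.
static uint8_t bilinear_qasymm8(uint8_t a00, uint8_t a01, uint8_t a10, uint8_t a11,
                                float dx, float dy, QuantInfo in_qi, QuantInfo out_qi)
{
    const float f00 = dequantize_u8(a00, in_qi);
    const float f01 = dequantize_u8(a01, in_qi);
    const float f10 = dequantize_u8(a10, in_qi);
    const float f11 = dequantize_u8(a11, in_qi);
    const float blended = f00 * (1.f - dx) * (1.f - dy) + f01 * dx * (1.f - dy)
                        + f10 * (1.f - dx) * dy + f11 * dx * dy;
    return quantize_u8(blended, out_qi);
}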
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(ARM_COMPUTE_ENABLE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/utils/ScaleUtils.h" -#include "support/Rounding.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace -{ -void qasymm8_signed_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) -{ - const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; - const size_t in_stride_wc = in_stride_w * in_stride_c; - const size_t in_dim_h = src->info()->dimension(2); - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win(window); - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator out(dst, win); - - const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do - { - // Store results - svst1_s8(pg, out_ptr + x, svld1_s8(pg, in_ptr + offset + offset_row + x)); - - x += svcntw(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(svptrue_b8(), pg)); - }, - out); -} - -void qasymm8_signed_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - // Data layout is NHWC - const int idx_width = 1; - const int idx_height = 2; - - // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), align_corners); - Window win_off; - win_off.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_off.set(Window::DimY, Window::Dimension(0, 0, 0)); - - // Don't increment in X and Y direction for the input tensor - // A pointer to the start of this plane is needed as base for the precomputed offsets - Window win_in(window); - win_in.set(idx_width, Window::Dimension(0, 0, 0)); - win_in.set(idx_height, Window::Dimension(0, 0, 0)); - - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) - { - win_off.set(d, Window::Dimension(0, 0, 0)); - } - - Iterator in(src, win_in); - Iterator out(dst, window); - - const int32_t in_dim_w = src->info()->dimension(idx_width); - const int32_t in_dim_h = src->info()->dimension(idx_height); - const int32_t stride_w = src->info()->strides_in_bytes()[idx_width]; - const int32_t stride_h = src->info()->strides_in_bytes()[idx_height]; - - const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - if(border_mode == BorderMode::CONSTANT) - { - const int8_t const_border_value = static_cast(constant_border_value.get()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto pixel_row_ptr = reinterpret_cast(in.ptr()); - - const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) : - const_border_value; - const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) : - const_border_value; - const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ? 
- (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - in, out); - } - else if(border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const int index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset); - const int32_t index_w = *(reinterpret_cast(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dx_val = *(reinterpret_cast(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dy_val = *(reinterpret_cast(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto pixel_row_ptr = reinterpret_cast(in.ptr()); - - auto clamped_w = utility::clamp(index_w, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp(index_w + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp(index_h, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp(index_h + 1, 0, in_dim_h - 1); - - const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h); - const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h); - const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h); - const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h); - - const float inp00 = Qasymm8QuantizationHelper::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper::dequantize(a11, iq_info); - *reinterpret_cast(out.ptr()) = Qasymm8QuantizationHelper::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - in, out); - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } -} -} -namespace cpu -{ -void qasymm8_signed_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) -{ - if(policy == InterpolationPolicy::BILINEAR) - { - qasymm8_signed_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); - } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) - { - qasymm8_signed_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); - } -} -} // namespace cpu -} // namespace arm_compute - -#endif // ARM_COMPUTE_ENABLE_SVE \ No newline at end of file diff --git a/src/core/cpu/kernels/softmax/impl/neon/list.h b/src/core/cpu/kernels/softmax/impl/neon/list.h deleted file mode 100644 index 5ebee31272..0000000000 --- a/src/core/cpu/kernels/softmax/impl/neon/list.h +++ /dev/null @@ -1,388 +0,0 @@ -/* - * Copyright (c) 2021 Arm 
Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H -#define SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H - -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "support/SaturateCast.h" - -namespace arm_compute -{ -namespace cpu -{ -template -void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) -{ - /** SIMD vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - constexpr int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator input(in, win); - Iterator output(out, win); - - const int sum_stages = log2(window_step_x / 2); - execute_window_loop(win, [&](const Coordinates &) - { - // Get pointers - const auto in_ptr = reinterpret_cast(input.ptr()); - const auto out_ptr = reinterpret_cast(output.ptr()); - - // Init max value - auto vec_max = wrapper::vdup_n(support::cpp11::lowest(), ExactTagType{}); - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto current_value = wrapper::vloadq(in_ptr + x); - vec_max = wrapper::vmax(vec_max, current_value); - } - auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); - - for(int i = 0; i < sum_stages; ++i) - { - carry_max = wrapper::vpmax(carry_max, carry_max); - } - T max_val = wrapper::vgetlane(carry_max, 0); - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - max_val = *(in_ptr + x) > max_val ? 
*(in_ptr + x) : max_val; - } - - *out_ptr = max_val; - }, - input, output); -} - -template -void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window) -{ - static_assert(std::is_same::value - || std::is_same::value, - "quantized type should be either qasymm8_t or qasymm8_signed_t."); - - const int start_x = in->info()->valid_region().anchor.x(); - const int input_width = in->info()->valid_region().shape.x(); - - const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; - const auto scale_beta_vec = vdupq_n_f32(scale_beta); - - Iterator in_it(in, window); - Iterator max_it(max, window); - Iterator out_it(out, window); - constexpr int vec_size = 16; - - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); - - float sum{}; - float sum_inversed{}; - - /* Compute exponentials and sum */ - { - /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); - const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{}); - - /* Init sum to zero */ - float32x4x4_t vec_sum = - { - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - }; - - /* Loop over row and compute exponentials and sum */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - auto vec_elements = wrapper::vloadq(in_ptr + x); - vec_elements = wrapper::vqsub(vec_max, vec_elements); - auto vec_elements_flt = convert_int_to_float(vec_elements); - - if(is_log) - { - vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec); - vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec); - vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec); - vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec); - vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0])); - vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1])); - vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2])); - vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3])); - } - else - { - vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec)); - vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec)); - vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec)); - vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec)); - vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]); - vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]); - vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]); - vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]); - } - - vst4q_f32(tmp_ptr + x, vec_elements_flt); - } - - /* Reduce sum */ - const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3])); - auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte)); - sum_res = vpadd_f32(sum_res, sum_res); - sum = wrapper::vgetlane(sum_res, 0); - - /* Run remaining elements */ - for(; x < input_width; ++x) - { - float element{}; - if(is_log) - { - element = (max_val - in_ptr[x]) * 
scale_beta; - sum += std::exp(element); - } - else - { - element = std::exp((max_val - in_ptr[x]) * scale_beta); - sum += element; - } - - tmp_ptr[x] = element; - } - - if(!is_log) - { - sum_inversed = 256.f / sum; - } - else - { - sum = std::log(sum); - } - } - - /* Normalize exponentials */ - { - constexpr bool is_qasymm8_signed = std::is_same::value; - /* Loop over row and compute softmax */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - using int_vec_type = wrapper::traits::neon_vector_t; - float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); - int_vec_type normalized_value{}; - if(is_log) - { - const float32x4x4_t sub = - { - vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)), - }; - normalized_value = convert_float_to_int(sub); - } - else - { - float32x4x4_t mul = - { - vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)), - }; - - if(is_qasymm8_signed) - { - const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{}); - mul.val[0] = wrapper::vsub(mul.val[0], offset_vec); - mul.val[1] = wrapper::vsub(mul.val[1], offset_vec); - mul.val[2] = wrapper::vsub(mul.val[2], offset_vec); - mul.val[3] = wrapper::vsub(mul.val[3], offset_vec); - } - - normalized_value = convert_float_to_int(mul); - } - wrapper::vstore(out_ptr + x, normalized_value); - } - /* Run remaining elements */ - for(; x < input_width; ++x) - { - if(is_log) - { - out_ptr[x] = utils::cast::saturate_cast(tmp_ptr[x] - sum); - } - else - { - out_ptr[x] = utils::cast::saturate_cast((tmp_ptr[x] * sum_inversed) - (is_qasymm8_signed ? 128.f : 0)); - } - } - } - }, - in_it, max_it, out_it); -} - -template -void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) -{ - const int start_x = in->info()->valid_region().anchor.x(); - const int input_width = in->info()->valid_region().shape.x(); - - Iterator in_it(in, window); - Iterator max_it(max, window); - Iterator out_it(out, window); - - /** SIMD vector tag type. 
*/ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - constexpr int vec_size = 16 / sizeof(T); - const int sum_stages = log2(vec_size / 2); - - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); - - T sum{}; - T sum_inversed{}; - - /* Compute exponentials and sum */ - { - /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); - const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{}); - - /* Init sum to zero */ - auto vec_sum = wrapper::vdup_n(static_cast(0), ExactTagType{}); - - /* Loop over row and compute exponentials and sum */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - auto vec_elements = wrapper::vloadq(in_ptr + x); - vec_elements = wrapper::vsub(vec_elements, vec_max); - if(is_log) - { - vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast(beta), ExactTagType{})); - vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); - } - else - { - vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast(beta), ExactTagType{}))); - vec_sum = wrapper::vadd(vec_sum, vec_elements); - } - wrapper::vstore(tmp_ptr + x, vec_elements); - } - - /* Reduce sum */ - auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum)); - for(int i = 0; i < sum_stages; ++i) - { - sum_res = wrapper::vpadd(sum_res, sum_res); - } - sum = wrapper::vgetlane(sum_res, 0); - - /* Run remaining elements */ - for(; x < input_width; ++x) - { - T element{}; - - if(is_log) - { - element = (in_ptr[x] - max_val) * beta; - sum += std::exp(element); - } - else - { - element = std::exp((in_ptr[x] - max_val) * beta); - sum += element; - } - tmp_ptr[x] = element; - } - - if(!is_log) - { - sum_inversed = T(1) / sum; - } - else - { - sum = static_cast(std::log(sum)); - } - } - - /* Normalize exponentials */ - { - /* Loop over row and compute softmax */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - auto vec_in = wrapper::vloadq(tmp_ptr + x); - auto normalized_value = wrapper::vdup_n(static_cast(0), ExactTagType{}); - if(is_log) - { - normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast(sum), ExactTagType{})); - } - else - { - normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast(sum_inversed), ExactTagType{})); - } - wrapper::vstore(out_ptr + x, normalized_value); - } - /* Run remaining elements */ - for(; x < input_width; ++x) - { - if(is_log) - { - out_ptr[x] = tmp_ptr[x] - sum; - } - else - { - out_ptr[x] = tmp_ptr[x] * sum_inversed; - } - } - } - }, - in_it, max_it, out_it); -} - -} // namespace cpu -} // namespace arm_compute - -#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H */ diff --git a/src/core/cpu/kernels/softmax/impl/sve/impl.cpp b/src/core/cpu/kernels/softmax/impl/sve/impl.cpp deleted file mode 100644 index 7a577fd565..0000000000 --- a/src/core/cpu/kernels/softmax/impl/sve/impl.cpp +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
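Both the NEON and the SVE float softmax kernels implement the same numerically stable three-pass scheme: subtract the per-row maximum produced by the logits_1d_max kernel, exponentiate and accumulate the sum (kept in log space when is_log is set), then normalise by the reciprocal of the sum. A plain scalar reference of the non-log path that the vector code can be checked against (illustrative only, not library code):

#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

// Numerically stable softmax over one row:
//   y[i] = exp(beta * (x[i] - max(x))) / sum_j exp(beta * (x[j] - max(x)))
std::vector<float> softmax_row(const std::vector<float> &x, float beta)
{
    float max_val = std::numeric_limits<float>::lowest();
    for (float v : x)
    {
        max_val = std::max(max_val, v); // pass 1: row maximum
    }

    std::vector<float> y(x.size());
    float sum = 0.f;
    for (size_t i = 0; i < x.size(); ++i)
    {
        y[i] = std::exp((x[i] - max_val) * beta); // pass 2: shifted exponentials
        sum += y[i];
    }

    const float inv_sum = 1.f / sum;
    for (float &v : y)
    {
        v *= inv_sum; // pass 3: normalisation
    }
    return y;
}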
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(ARM_COMPUTE_ENABLE_SVE) -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/SVEMath.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -template -void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) -{ - const auto all_true_pg = wrapper::svptrue(); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator input(in, win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - // Get pointers - const auto in_ptr = reinterpret_cast(input.ptr()); - const auto out_ptr = reinterpret_cast(output.ptr()); - - // Init max value - auto vec_max = wrapper::svdup_n(support::cpp11::lowest()); - - int x = window_start_x; - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto current_value = svld1(pg, in_ptr + x); - vec_max = svmax_m(pg, vec_max, current_value); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - - auto max_val = svmaxv(all_true_pg, vec_max); - - *out_ptr = max_val; - }, - input, output); -} - -template -void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) -{ - const int start_x = in->info()->valid_region().anchor.x(); - const int input_width = in->info()->valid_region().shape.x(); - - Iterator in_it(in, window); - Iterator max_it(max, window); - Iterator out_it(out, window); - - const auto all_true_pg = wrapper::svptrue(); - - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); - - ScalarType sum{ 0 }; - - /* Compute exponentials and sum */ - { - /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); - const auto vec_max = wrapper::svdup_n(max_val); - - /* Init sum to zero */ - auto vec_sum = 
wrapper::svdup_n(static_cast(0)); - - /* Loop over row and compute exponentials and sum */ - int x = 0; - svbool_t pg = wrapper::svwhilelt(x, input_width); - do - { - auto vec_elements = svld1(pg, in_ptr + x); - vec_elements = svsub_z(pg, vec_elements, vec_max); - if(is_log) - { - vec_elements = svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast(beta))); - vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements)); - } - else - { - vec_elements = wrapper::svexp_z(pg, svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast(beta)))); - vec_sum = svadd_m(pg, vec_sum, vec_elements); - } - svst1(pg, tmp_ptr + x, vec_elements); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, input_width); - } - while(svptest_any(all_true_pg, pg)); - - /* Reduce sum */ - sum = svaddv(all_true_pg, vec_sum); - - if(is_log) - { - sum = static_cast(std::log(sum)); - } - else - { - sum = ScalarType(1) / sum; - } - } - - /* Normalize exponentials */ - { - /* Loop over row and compute softmax */ - int x = 0; - svbool_t pg = wrapper::svwhilelt(x, input_width); - do - { - auto vec_in = svld1(pg, tmp_ptr + x); - auto normalized_value = wrapper::svdup_n(static_cast(0)); - if(is_log) - { - normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast(sum))); - } - else - { - normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast(sum))); - } - svst1(pg, out_ptr + x, normalized_value); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, input_width); - } - while(svptest_any(all_true_pg, pg)); - } - }, - in_it, max_it, out_it); -} - -template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); -template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); -template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); -template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); - -template void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window); -template void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window); -} // namespace cpu -} // namespace arm_compute -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ diff --git a/src/core/cpu/kernels/softmax/impl/sve/list.h b/src/core/cpu/kernels/softmax/impl/sve/list.h deleted file mode 100644 index b4e1e1b186..0000000000 --- a/src/core/cpu/kernels/softmax/impl/sve/list.h +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
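All of the SVE kernels in this change share one loop idiom: build a predicate with svwhilelt for the elements that remain, run one predicated vector iteration, advance by the hardware vector length, and stop once svptest_any reports an all-false predicate, so no scalar tail loop is needed. A minimal standalone example of that pattern (requires an SVE-capable toolchain and target, e.g. -march=armv8-a+sve; not library code):

#include <arm_sve.h>
#include <cstdint>

// Multiply src by a scalar factor into dst using predication instead of a scalar tail loop.
void scale_array_sve(const float *src, float *dst, int32_t n, float factor)
{
    int32_t x = 0;
    svbool_t pg = svwhilelt_b32(x, n); // lanes [x, n) are active
    do
    {
        const svfloat32_t v = svld1_f32(pg, src + x);         // predicated load
        svst1_f32(pg, dst + x, svmul_n_f32_z(pg, v, factor)); // predicated multiply and store
        x += static_cast<int32_t>(svcntw());                  // advance by the vector length in 32-bit lanes
        pg = svwhilelt_b32(x, n);
    } while (svptest_any(svptrue_b32(), pg));                 // continue while any lane is still active
}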
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H -#define SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H - -#if defined(ARM_COMPUTE_ENABLE_SVE) -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/SVEMath.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -template -void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); - -template -void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window); - -#if defined(ARM_COMPUTE_ENABLE_SVE2) -template -void sve_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window) -{ - const int start_x = in->info()->valid_region().anchor.x(); - const int input_width = in->info()->valid_region().shape.x(); - - const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; - const auto scale_beta_vec = svdup_n_f32(scale_beta); - - Iterator in_it(in, window); - Iterator max_it(max, window); - Iterator out_it(out, window); - const auto all_true_pg = wrapper::svptrue(); - using SVEType = typename wrapper::traits::sve_vector::type; - - const int inc_1 = static_cast(svcntw()); - const int inc_2 = static_cast(2 * svcntw()); - const int inc_3 = static_cast(3 * svcntw()); - - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); - - float sum{}; - - /* Compute exponentials and sum */ - { - /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); - const auto vec_max = wrapper::svdup_n(max_val); - - /* Init sum to zero */ - auto vec_sum_0 = svdup_n_f32(0.f); - auto vec_sum_1 = svdup_n_f32(0.f); - auto vec_sum_2 = svdup_n_f32(0.f); - auto vec_sum_3 = svdup_n_f32(0.f); - - /* Loop over row and compute exponentials and sum */ - int x = 0; - svbool_t pg = wrapper::svwhilelt(x, input_width); - svbool_t pg_0 = svunpklo(svunpklo(pg)); - svbool_t pg_1 = svunpkhi(svunpklo(pg)); - svbool_t pg_2 = svunpklo(svunpkhi(pg)); - svbool_t pg_3 = svunpkhi(svunpkhi(pg)); - do - { - auto vec_elements = svld1(pg, in_ptr + x); - vec_elements = svsub_z(pg, vec_max, vec_elements); - - auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements))); - auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements))); - auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements))); - auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements))); - - if(is_log) - { - vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec); - vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec); - vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec); - vec_elements_flt_3 = svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec); - vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0)); - vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, 
vec_elements_flt_1)); - vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2)); - vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3)); - } - else - { - vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec)); - vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec)); - vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec)); - vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec)); - vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0); - vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, vec_elements_flt_1); - vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2); - vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3); - } - - svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0); - svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1); - svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2); - svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, input_width); - pg_0 = svunpklo(svunpklo(pg)); - pg_1 = svunpkhi(svunpklo(pg)); - pg_2 = svunpklo(svunpkhi(pg)); - pg_3 = svunpkhi(svunpkhi(pg)); - } - while(svptest_any(all_true_pg, pg)); - - /* Reduce sum */ - const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1), svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3)); - sum = svaddv_f32(all_true_pg, vec_sum); - - /* Run remaining elements */ - x = 0; - if(is_log) - { - sum = std::log(sum); - } - else - { - sum = 256.f / sum; - } - } - - /* Normalize exponentials */ - { - constexpr bool is_qasymm8_signed = std::is_same::value; - /* Loop over row and compute softmax */ - int x = 0; - svbool_t pg = wrapper::svwhilelt(x, input_width); - svbool_t pg_0 = svunpklo(svunpklo(pg)); - svbool_t pg_1 = svunpkhi(svunpklo(pg)); - svbool_t pg_2 = svunpklo(svunpkhi(pg)); - svbool_t pg_3 = svunpkhi(svunpkhi(pg)); - do - { - auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x); - auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1); - auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2); - auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3); - - svfloat32_t res_0{}; - svfloat32_t res_1{}; - svfloat32_t res_2{}; - svfloat32_t res_3{}; - - if(is_log) - { - res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); - res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); - res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); - res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); - } - else - { - res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); - res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); - res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); - res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); - - if(is_qasymm8_signed) - { - const auto offset_vec = svdup_n_f32(128.f); - res_0 = svsub_z(pg_0, vec_in_0, offset_vec); - res_1 = svsub_z(pg_1, vec_in_1, offset_vec); - res_2 = svsub_z(pg_2, vec_in_2, offset_vec); - res_3 = svsub_z(pg_3, vec_in_3, offset_vec); - } - } - - // Store value - const auto out = convert_float_to_int(res_0, res_1, res_2, res_3); - svst1(pg, out_ptr + x, out); - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, input_width); - pg_0 = svunpklo(svunpklo(pg)); - pg_1 = svunpkhi(svunpklo(pg)); - pg_2 = svunpklo(svunpkhi(pg)); - pg_3 = svunpkhi(svunpkhi(pg)); - } - while(svptest_any(all_true_pg, pg)); - } - }, - in_it, max_it, out_it); -} -#endif /* 
defined(ARM_COMPUTE_ENABLE_SVE2) */ -} // namespace cpu -} // namespace arm_compute -#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ - -#endif /* SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H */ diff --git a/src/core/cpu/kernels/sub/neon/list.h b/src/core/cpu/kernels/sub/neon/list.h deleted file mode 100644 index ac1346001a..0000000000 --- a/src/core/cpu/kernels/sub/neon/list.h +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_NEON_KERNELS_SUB_LIST_H -#define SRC_CORE_NEON_KERNELS_SUB_LIST_H - -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/wrapper.h" - -namespace arm_compute -{ -namespace cpu -{ -#define DECLARE_SUB_KERNEL(func_name) \ - void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) - -DECLARE_SUB_KERNEL(sub_qasymm8_neon); -DECLARE_SUB_KERNEL(sub_qasymm8_signed_neon); -DECLARE_SUB_KERNEL(sub_qsymm16_neon); - -#undef DECLARE_SUB_KERNEL - -template -void sub_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - /** SIMD vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - bool is_sat = policy == ConvertPolicy::SATURATE; - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - constexpr int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - - Iterator input1(src0, window.broadcast_if_dimension_le_one(src0->info()->tensor_shape())); - Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape())); - Iterator output(dst, window); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? 
input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const T broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); - auto res = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) : wrapper::vsub(broadcast_value_vec, non_broadcast_v); - if(is_broadcast_input_2) - { - res = wrapper::vmul(res, wrapper::vdup_n(static_cast(-1), ExactTagType{})); - } - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - auto res = is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v; - if(is_broadcast_input_2) - { - res = static_cast(-1) * res; - } - - *(output_ptr + x) = res; - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto val1 = wrapper::vloadq(input1_ptr + x); - const auto val2 = wrapper::vloadq(input2_ptr + x); - const auto res = is_sat ? wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto val1 = *(input1_ptr + x); - const auto val2 = *(input2_ptr + x); - *(output_ptr + x) = is_sat ? wrapper::sub_sat(val1, val2) : val1 - val2; - } - }, - input1, input2, output); - } -} -} // namespace cpu -} // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_SUB_LIST_H diff --git a/src/core/cpu/kernels/sub/neon/qasymm8.cpp b/src/core/cpu/kernels/sub/neon/qasymm8.cpp deleted file mode 100644 index 8f4cd8bdbb..0000000000 --- a/src/core/cpu/kernels/sub/neon/qasymm8.cpp +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -void sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - ARM_COMPUTE_UNUSED(policy); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - - const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); - const float32x4_t voffseto = vdupq_n_f32(oq_info.offset); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale); - const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale); - const int32x4_t voffset1 = is_broadcast_input_2 ? 
vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset); - const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); - - const auto broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(static_cast<uint8_t>(broadcast_value), wrapper::traits::vector_128_tag{}); - - const float32x4x4_t bf = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2), - } - }; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(non_broadcast_input_ptr + x); - - const float32x4x4_t af = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), - } - }; - - const int32x4x4_t rf = - { - { -#ifdef __aarch64__ - vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), -#else //__aarch64__ - vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ?
vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), -#endif //__aarch64__ - } - }; - - const auto pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); - const auto pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); - wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale; - const float bfs = static_cast(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale; - *(output_ptr + x) = quantize_qasymm8(is_broadcast_input_2 ? afs - bfs : bfs - afs, dst->info()->quantization_info()); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); - const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); - const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset); - const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset); - - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - - const float32x4x4_t af = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), - } - }; - - const float32x4x4_t bf = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2), - } - }; - - const int32x4x4_t rf = - { - { -#ifdef __aarch64__ - vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), - 
vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), -#else //__aarch64__ - vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), -#endif //__aarch64__ - } - }; - - const auto pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); - const auto pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); - wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale; - const float bfs = static_cast((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale; - - *(output_ptr + x) = quantize_qasymm8((afs - bfs), dst->info()->quantization_info()); - } - }, - input1, input2, output); - } -} - -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp b/src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp deleted file mode 100644 index 2c9e411743..0000000000 --- a/src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp +++ /dev/null @@ -1,229 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -void sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - ARM_COMPUTE_UNUSED(policy); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - - const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); - const float32x4_t voffseto = vdupq_n_f32(oq_info.offset); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale); - const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale); - const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset); - const int32x4_t voffset2 = is_broadcast_input_2 ? 
vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); - - const auto broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(static_cast<int8_t>(broadcast_value), wrapper::traits::vector_128_tag{}); - - const float32x4x4_t bf = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2), - } - }; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(non_broadcast_input_ptr + x); - - const float32x4x4_t af = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), - } - }; - - const int32x4x4_t rf = - { - { -#ifdef __aarch64__ - vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), -#else //__aarch64__ - vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ?
vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), -#endif //__aarch64__ - } - }; - - const auto pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); - const auto pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); - wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale; - const float bfs = static_cast(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale; - *(output_ptr + x) = quantize_qasymm8_signed(is_broadcast_input_2 ? afs - bfs : bfs - afs, dst->info()->quantization_info()); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); - const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); - const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset); - const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset); - - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - - const float32x4x4_t af = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), - } - }; - - const float32x4x4_t bf = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2), - } - }; - - const int32x4x4_t rf = - { - { -#ifdef __aarch64__ - vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), - vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), 
-#else //__aarch64__ - vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), - vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), -#endif //__aarch64__ - } - }; - - const auto pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); - const auto pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); - wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale; - const float bfs = static_cast((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale; - - *(output_ptr + x) = quantize_qasymm8_signed((afs - bfs), dst->info()->quantization_info()); - } - }, - input1, input2, output); - } -} -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/sub/neon/qsymm16.cpp b/src/core/cpu/kernels/sub/neon/qsymm16.cpp deleted file mode 100644 index 4dfdc0e78c..0000000000 --- a/src/core/cpu/kernels/sub/neon/qsymm16.cpp +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - ARM_COMPUTE_UNUSED(policy); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 8; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - - const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - - const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); - const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); - const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src1 : src0; - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const int16_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value); - - const float32x4x2_t bf = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2), - } - }; - const float bfs = static_cast(broadcast_value) * broadcast_qinfo.scale; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x); - const float32x4x2_t af = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1), - } - }; - - const int32x4x4_t rf = - { - { -#ifdef __aarch64__ - vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), -#else //__aarch64__ - vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), -#endif //__aarch64__ - } - }; - - const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); - vst1q_s16(output_ptr + x, pa); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale; - *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? 
(bfs - afs) : (afs - bfs), oq_info); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8_t a = vld1q_s16(input1_ptr + x); - const int16x8_t b = vld1q_s16(input2_ptr + x); - - const float32x4x2_t af = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1), - } - }; - - const float32x4x2_t bf = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2), - } - }; - - const int32x4x2_t rf = - { - { -#ifdef __aarch64__ - vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), -#else //__aarch64__ - vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), -#endif //__aarch64__ - } - }; - - const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); - vst1q_s16(output_ptr + x, pa); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast((*(input1_ptr + x))) * iq1_info.scale; - const float bfs = static_cast((*(input2_ptr + x))) * iq2_info.scale; - *(output_ptr + x) = quantize_qsymm16((afs - bfs), dst->info()->quantization_info()); - } - }, - input1, input2, output); - } -} -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/gpu/cl/ClCompileContext.h b/src/core/gpu/cl/ClCompileContext.h deleted file mode 100644 index e69cc0200f..0000000000 --- a/src/core/gpu/cl/ClCompileContext.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CL_COMPILE_CONTEXT_H -#define ARM_COMPUTE_CL_COMPILE_CONTEXT_H - -#include "arm_compute/core/CL/CLCompileContext.h" - -namespace arm_compute -{ -namespace opencl -{ -using ClCompileContext = arm_compute::CLCompileContext; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_COMPILE_CONTEXT_H */ diff --git a/src/core/gpu/cl/ClKernelLibrary.cpp b/src/core/gpu/cl/ClKernelLibrary.cpp deleted file mode 100644 index 4a9ba874b1..0000000000 --- a/src/core/gpu/cl/ClKernelLibrary.cpp +++ /dev/null @@ -1,1029 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/ClKernelLibrary.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Utils.h" - -#include -#include -#include -#include - -#ifdef ARM_COMPUTE_COMPRESSED_KERNELS -#include - -namespace -{ -/* Decoding table */ -constexpr std::array b64_invtab = -{ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62, 0, 0, 0, 63, - 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0, 0, 0, - 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, - 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; - -/** Decode a base64 encoded string - * - * @param[in] str Base64 encoded string to decode - * - * @return The decode string in case of a valid, non-empty string otherwise an empty string - */ -std::string decode_base64(const std::string &str) -{ - constexpr const char pad_char = '='; - - // Handle empty string - if(str.empty()) - { - return {}; - } - - // Base64 encoded string has size multiple of 4 - if(str.length() % 4) - { - return {}; - } - - // - // Check encoded string padding - std::size_t padding = (str.rbegin()[0] == pad_char) + (str.rbegin()[1] == pad_char); - const int str_len = str.size(); - - // Reserve memory for the decoded string - // 
Note each 4 consecutive elements of 6-bit encode 3 bytes - std::string dec_b64; - dec_b64.reserve(((str_len / 4) * 3)); - - // Block decoding function (exclude padding) - int c = 0; - const int end = str_len - 4 - padding; - for(; c <= end; c += 4) - { - const int byte0 = b64_invtab[str[c]]; - const int byte1 = b64_invtab[str[c + 1]]; - const int byte2 = b64_invtab[str[c + 2]]; - const int byte3 = b64_invtab[str[c + 3]]; - - dec_b64.push_back((byte0 << 2) | (byte1 >> 4)); - dec_b64.push_back((byte1 << 4) | (byte2 >> 2)); - dec_b64.push_back((byte2 << 6) | (byte3)); - } - - // Last step that might contain padding symbols - if(padding == 1) - { - const int byte0 = b64_invtab[str[c]]; - const int byte1 = b64_invtab[str[c + 1]]; - const int byte2 = b64_invtab[str[c + 2]]; - - dec_b64.push_back((byte0 << 2) | (byte1 >> 4)); - dec_b64.push_back((byte1 << 4) | (byte2 >> 2)); - } - else if(padding == 2) - { - const int byte0 = b64_invtab[str[c]]; - const int byte1 = b64_invtab[str[c + 1]]; - - dec_b64.push_back((byte0 << 2) | (byte1 >> 4)); - } - - return dec_b64; -} - -/** Decompress a zlib compressed string - * - * @param[in] str ZLib compressed string - * - * @return The decompressed string if successful, otherwise false. - */ -std::string decompress_zlib(const std::string &str) -{ - // Create and initialize decompression stream - z_stream ds{}; - if(inflateInit(&ds) != Z_OK) - { - return std::string(); - } - ds.avail_in = str.size(); - ds.next_in = (Bytef *)str.data(); - - // Roll-over the string using a buffer and decompress - int status = Z_OK; - char roll_buff[16384]; - std::string inflated_str; - do - { - ds.avail_out = sizeof(roll_buff); - ds.next_out = reinterpret_cast(roll_buff); - - status = inflate(&ds, 0); - if(inflated_str.size() < ds.total_out) - { - inflated_str.append(roll_buff, ds.total_out - inflated_str.size()); - } - } - while(status == Z_OK); - - // Finalize decompression stream - inflateEnd(&ds); - if(status != Z_STREAM_END) - { - return std::string(); - } - - return inflated_str; -} -} // namespace -#endif /* ARM_COMPUTE_COMPRESSED_KERNELS */ - -namespace arm_compute -{ -namespace opencl -{ -const std::map ClKernelLibrary::_kernel_program_map = -{ - // Common Kernels - { "activation_layer", "common/activation_layer.cl" }, - { "activation_layer_quant", "common/activation_layer_quant.cl" }, - { "activation_layer_quant_f32", "common/activation_layer_quant.cl" }, - { "arg_min_max_x", "common/arg_min_max.cl" }, - { "arg_min_max_y", "common/arg_min_max.cl" }, - { "arg_min_max_z", "common/arg_min_max.cl" }, - { "arg_min_max_w", "common/arg_min_max.cl" }, - { "bitwise_or", "common/bitwise_op.cl" }, - { "bitwise_and", "common/bitwise_op.cl" }, - { "bitwise_xor", "common/bitwise_op.cl" }, - { "bitwise_not", "common/bitwise_op.cl" }, - { "bounding_box_transform", "common/bounding_box_transform.cl" }, - { "bounding_box_transform_quantized", "common/bounding_box_transform_quantized.cl" }, - { "compare_equal", "common/comparisons.cl" }, - { "compare_equal_quantized", "common/comparisons.cl" }, - { "compare_notequal", "common/comparisons.cl" }, - { "compare_notequal_quantized", "common/comparisons.cl" }, - { "compare_greater", "common/comparisons.cl" }, - { "compare_greater_quantized", "common/comparisons.cl" }, - { "compare_greaterequal", "common/comparisons.cl" }, - { "compare_greaterequal_quantized", "common/comparisons.cl" }, - { "compare_less", "common/comparisons.cl" }, - { "compare_less_quantized", "common/comparisons.cl" }, - { "compare_lessequal", "common/comparisons.cl" }, - { 
"compare_lessequal_quantized", "common/comparisons.cl" }, - { "concatenate", "common/concatenate.cl" }, - { "concatenate_width", "common/concatenate.cl" }, - { "concatenate_height", "common/concatenate.cl" }, - { "concatenate_width_x2", "common/concatenate.cl" }, - { "concatenate_width_x4", "common/concatenate.cl" }, - { "col2im", "common/col2im.cl" }, - { "cast_down", "common/cast.cl" }, - { "cast_up", "common/cast.cl" }, - { "convert_fc_weights", "common/convert_fc_weights.cl" }, - { "copy_tensor", "common/copy_tensor.cl" }, - { "crop_tensor", "common/crop_tensor.cl" }, - { "deconvolution_reshape", "common/deconvolution_layer.cl" }, - { "deconvolution_upsample", "common/deconvolution_layer.cl" }, - { "dequantization_layer", "common/dequantization_layer.cl" }, - { "elementwise_operation_ADD", "common/elementwise_operation.cl" }, - { "elementwise_operation_SUB", "common/elementwise_operation.cl" }, - { "elementwise_operation_MAX", "common/elementwise_operation.cl" }, - { "elementwise_operation_MIN", "common/elementwise_operation.cl" }, - { "elementwise_operation_DIV", "common/elementwise_operation.cl" }, - { "elementwise_operation_SQUARED_DIFF", "common/elementwise_operation.cl" }, - { "elementwise_operation_POWER", "common/elementwise_operation.cl" }, - { "elementwise_operation_PRELU", "common/elementwise_operation.cl" }, - { "elementwise_operation_AND", "common/elementwise_operation.cl" }, - { "elementwise_operation_OR", "common/elementwise_operation.cl" }, - { "elementwise_operation_ADD_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_SUB_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_MAX_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_MIN_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_DIV_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_SQUARED_DIFF_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_PRELU_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_unary", "common/elementwise_unary.cl" }, - { "fft_digit_reverse_axis_0", "common/fft_digit_reverse.cl" }, - { "fft_digit_reverse_axis_1", "common/fft_digit_reverse.cl" }, - { "fft_radix_2_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_2_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_2_axis_0", "common/fft.cl" }, - { "fft_radix_2_axis_1", "common/fft.cl" }, - { "fft_radix_3_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_3_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_3_axis_0", "common/fft.cl" }, - { "fft_radix_3_axis_1", "common/fft.cl" }, - { "fft_radix_4_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_4_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_4_axis_0", "common/fft.cl" }, - { "fft_radix_4_axis_1", "common/fft.cl" }, - { "fft_radix_5_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_5_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_5_axis_0", "common/fft.cl" }, - { "fft_radix_5_axis_1", "common/fft.cl" }, - { "fft_radix_7_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_7_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_7_axis_0", "common/fft.cl" }, - { "fft_radix_7_axis_1", "common/fft.cl" }, - { "fft_radix_8_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_8_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_8_axis_0", "common/fft.cl" }, - { "fft_radix_8_axis_1", "common/fft.cl" }, - { 
"fft_scale_conj", "common/fft_scale.cl" }, - { "fill_image_borders_constant", "common/fill_border.cl" }, - { "fill_image_borders_replicate", "common/fill_border.cl" }, - { "floor_layer", "common/floor.cl" }, - { "fuse_batchnormalization_layer", "common/batchnormalization_layer.cl" }, - { "gather", "common/gather.cl" }, - { "gemm_ma_f16", "common/gemm.cl" }, - { "gemm_ma_f32", "common/gemm.cl" }, - { "gemm_mv", "common/gemv.cl" }, - { "gemm_mv_quantized", "common/gemv.cl" }, - { "gemm_mm_interleaved_transposed_f16", "common/gemm_v1.cl" }, - { "gemm_mm_interleaved_transposed_f16_acc32", "common/gemm_v1.cl" }, - { "gemm_mm_interleaved_transposed_f16_bifrost", "common/gemm_v1.cl" }, - { "gemm_mm_interleaved_transposed_f32", "common/gemm_v1.cl" }, - { "gemm_mm_interleaved_transposed_f32_bifrost", "common/gemm_v1.cl" }, - { "gemm_mm_floating_point", "common/gemm_v1.cl" }, - { "gemm_mm_floating_point_f16_bifrost", "common/gemm_v1.cl" }, - { "gemm_mm_floating_point_f16_bifrost_acc32", "common/gemm_v1.cl" }, - { "gemm_mm_floating_point_f32_bifrost", "common/gemm_v1.cl" }, - { "gemm_mm_floating_point_f32_bifrost_1000", "common/gemm_v1.cl" }, - { "gemm_mm_native", "common/gemm.cl" }, - { "gemm_mm_reshaped_lhs_nt_rhs_t", "common/gemm.cl" }, - { "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl" }, - { "gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl" }, - { "gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_t", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl" }, - { "gemm_lc_vm_f32", "common/gemm.cl" }, - { "gemm_reshape_lhs_matrix_nt", "common/gemm.cl" }, - { "gemm_reshape_lhs_matrix_t", "common/gemm.cl" }, - { "gemm_reshape_rhs_matrix_nt", "common/gemm.cl" }, - { "gemm_reshape_rhs_matrix_t", "common/gemm.cl" }, - { "gemmlowp_matrix_a_reduction", "common/gemmlowp.cl" }, - { "gemmlowp_matrix_a_reduction_dot8", "common/gemmlowp.cl" }, - { "gemmlowp_matrix_b_reduction", "common/gemmlowp.cl" }, - { "gemmlowp_mm_native", "common/gemmlowp.cl" }, - { "gemmlowp_mm_reshaped_lhs_nt_rhs_t", "common/gemmlowp.cl" }, - { "gemmlowp_mm_reshaped_only_rhs_t", "common/gemmlowp.cl" }, - { "gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint", "common/gemmlowp.cl" }, - { "gemmlowp_offset_contribution", "common/gemmlowp.cl" }, - { "gemmlowp_offset_contribution_quantize_down", "common/gemmlowp.cl" }, - { "gemmlowp_offset_contribution_quantize_down_fixedpoint", "common/gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down", "common/gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down_fixedpoint", "common/gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", "common/gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down_float", "common/gemmlowp.cl" }, - { "generate_proposals_compute_all_anchors", "common/generate_proposals.cl" }, - { "generate_proposals_compute_all_anchors_quantized", "common/generate_proposals_quantized.cl" }, - { "instance_normalization", "common/instance_normalization.cl" }, - { "compute_mean_var", "common/instance_normalization.cl" }, - { "l2_normalize_x", "common/l2_normalize.cl" }, - { "l2_normalize_y", "common/l2_normalize.cl" }, - { "l2_normalize_z", "common/l2_normalize.cl" }, - { "max_unpooling_layer_2", "common/unpooling_layer.cl" }, - { "mean_stddev_normalization", "common/mean_stddev_normalization.cl" }, - { "memset", "common/memset.cl" }, - { 
"minmax_layer", "common/minmax_layer.cl" }, - { "non_max_suppression", "common/nonmax.cl" }, - { "pad_layer_constant", "common/pad_layer.cl" }, - { "pad_layer_symmetric_reflect", "common/pad_layer.cl" }, - { "permute", "common/permute.cl" }, - { "pixelwise_mul_complex", "common/pixelwise_mul_float.cl" }, - { "pixelwise_mul_float", "common/pixelwise_mul_float.cl" }, - { "pixelwise_mul_int", "common/pixelwise_mul_int.cl" }, - { "pixelwise_mul_quantized", "common/pixelwise_mul_int.cl" }, - { "pooling_layer_2", "common/pooling_layer.cl" }, - { "pooling_layer_3", "common/pooling_layer.cl" }, - { "pooling_layer_optimized_3", "common/pooling_layer.cl" }, - { "pooling_layer_7", "common/pooling_layer.cl" }, - { "qlstm_layer_normalization", "common/qlstm_layer_normalization.cl" }, - { "quantization_layer", "common/quantization_layer.cl" }, - { "range", "common/range.cl" }, - { "range_quantized", "common/range.cl" }, - { "reduction_operation_x", "common/reduction_operation.cl" }, - { "reduction_operation_non_parallel_x", "common/reduction_operation.cl" }, - { "reduction_operation_y", "common/reduction_operation.cl" }, - { "reduction_operation_z", "common/reduction_operation.cl" }, - { "reduction_operation_w", "common/reduction_operation.cl" }, - { "reshape_layer", "common/reshape_layer.cl" }, - { "reshape_to_columns", "common/convolution_layer.cl" }, - { "reverse", "common/reverse.cl" }, - { "roi_align_layer", "common/roi_align_layer.cl" }, - { "roi_align_layer_quantized", "common/roi_align_layer_quantized.cl" }, - { "roi_pooling_layer", "common/roi_pooling_layer.cl" }, - { "select_same_rank", "common/select.cl" }, - { "select_different_rank_2", "common/select.cl" }, - { "select_different_rank_n", "common/select.cl" }, - { "softmax_layer_norm", "common/softmax_layer.cl" }, - { "softmax_layer_norm_quantized", "common/softmax_layer_quantized.cl" }, - { "softmax_layer_max_shift_exp_sum_quantized_serial", "common/softmax_layer_quantized.cl" }, - { "softmax_layer_max_shift_exp_sum_quantized_parallel", "common/softmax_layer_quantized.cl" }, - { "softmax_layer_max_shift_exp_sum_serial", "common/softmax_layer.cl" }, - { "softmax_layer_max_shift_exp_sum_parallel", "common/softmax_layer.cl" }, - { "stack_layer", "common/stack_layer.cl" }, - { "strided_slice", "common/slice_ops.cl" }, - { "tile", "common/tile.cl" }, - { "transpose", "common/transpose.cl" }, -#ifdef ENABLE_NCHW_KERNELS - { "batch_to_space_nchw", "nchw/batch_to_space.cl" }, - { "batch_to_space_static_nchw", "nchw/batch_to_space.cl" }, - { "batchnormalization_layer_nchw", "nchw/batchnormalization_layer.cl" }, - { "channel_shuffle_nchw", "nchw/channel_shuffle.cl" }, - { "depth_to_space_nchw", "nchw/depth_to_space.cl" }, - { "dequantization_layer_per_channel_nchw", "nchw/dequantization_layer.cl" }, - { "direct_convolution1x1", "nchw/direct_convolution1x1.cl" }, - { "direct_convolution1x1_f32_bifrost", "nchw/direct_convolution1x1.cl" }, - { "direct_convolution3x3", "nchw/direct_convolution3x3.cl" }, - { "direct_convolution3x3_f32_bifrost", "nchw/direct_convolution3x3.cl" }, - { "direct_convolution5x5", "nchw/direct_convolution5x5.cl" }, - { "direct_convolution5x5_f32_bifrost", "nchw/direct_convolution5x5.cl" }, - { "direct_convolution_quantized", "nchw/direct_convolution_quantized.cl" }, - { "im2col1x1_stridex1_nchw", "nchw/im2col.cl" }, - { "im2col3x3_nchw", "nchw/im2col.cl" }, - { "im2col5x5_nchw", "nchw/im2col.cl" }, - { "im2col11x11_padx0_pady0_nchw", "nchw/im2col.cl" }, - { "im2col_generic_nchw", "nchw/im2col.cl" }, - { 
"im2col_generic_padx0_pady0_nchw", "nchw/im2col.cl" }, - { "normalization_layer_cross_map_nchw", "nchw/normalization_layer.cl" }, - { "normalization_layer_in_map_nchw", "nchw/normalization_layer.cl" }, - { "normalize_planar_yuv_layer_nchw", "nchw/normalize_planar_yuv_layer.cl" }, - { "normalize_planar_yuv_layer_q8_nchw", "nchw/normalize_planar_yuv_layer_quantized.cl" }, - { "pooling_layer_MxN_nchw", "nchw/pooling_layer.cl" }, - { "pooling_layer_2_nchw_indices_fp32", "nchw/pooling_layer.cl" }, - { "pooling_layer_2_nchw_indices_fp16", "nchw/pooling_layer.cl" }, - { "pooling_layer_MxN_quantized_nchw", "nchw/pooling_layer_quantized.cl" }, - { "prior_box_layer_nchw", "nchw/prior_box_layer.cl" }, - { "remap_nearest_neighbour_nchw", "nchw/remap.cl" }, - { "remap_bilinear_nchw", "nchw/remap.cl" }, - { "reorg_layer_nchw", "nchw/reorg_layer.cl" }, - { "scale_nearest_neighbour_nchw", "nchw/scale.cl" }, - { "scale_bilinear_nchw", "nchw/scale.cl" }, - { "space_to_batch_nchw", "nchw/space_to_batch.cl" }, - { "space_to_batch_static_nchw", "nchw/space_to_batch.cl" }, - { "space_to_depth_nchw", "nchw/space_to_depth.cl" }, - { "upsample_layer_nchw", "nchw/upsample_layer.cl" }, - { "winograd_filter_transform_2x2_3x3_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_2x1_3x1_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x2_1x3_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_3x3_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x1_3x1_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x3_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_5x5_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x1_5x1_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x5_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_input_transform_2x2_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_2x2_3x3_stepz2_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_2x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_2x1_3x1_stepz2_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_1x2_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_1x2_1x3_stepz2_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_4x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_5x5_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_4x1_5x1_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x5_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_output_transform_2x2_3x3_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_2x1_3x1_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_1x2_1x3_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_3x3_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_4x1_3x1_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x3_nchw", "nchw/winograd_output_transform.cl" }, - { 
"winograd_output_transform_4x4_5x5_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_4x1_5x1_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x5_nchw", "nchw/winograd_output_transform.cl" }, -#endif /* ENABLE_NCHW_KERNELS */ -#ifdef ENABLE_NHWC_KERNELS - { "batch_to_space_nhwc", "nhwc/batch_to_space.cl" }, - { "batch_to_space_static_nhwc", "nhwc/batch_to_space.cl" }, - { "batchnormalization_layer_nhwc", "nhwc/batchnormalization_layer.cl" }, - { "channel_shuffle_nhwc", "nhwc/channel_shuffle.cl" }, - { "depth_to_space_nhwc", "nhwc/depth_to_space.cl" }, - { "dequantization_layer_per_channel_nhwc", "nhwc/dequantization_layer.cl" }, - { "dwc_native_fp_nhwc", "nhwc/dwc_native_fp_nhwc.cl" }, - { "dwc_native_quantized_nhwc", "nhwc/dwc_native_quantized_nhwc.cl" }, - { "direct_convolution_nhwc", "nhwc/direct_convolution.cl" }, - { "im2col3x3_nhwc", "nhwc/im2col.cl" }, - { "im2col9x9_nhwc", "nhwc/im2col.cl" }, - { "im2col_generic_nhwc", "nhwc/im2col.cl" }, - { "normalization_layer_cross_map_nhwc", "nhwc/normalization_layer.cl" }, - { "normalization_layer_in_map_nhwc", "nhwc/normalization_layer.cl" }, - { "normalize_planar_yuv_layer_nhwc", "nhwc/normalize_planar_yuv_layer.cl" }, - { "normalize_planar_yuv_layer_q8_nhwc", "nhwc/normalize_planar_yuv_layer_quantized.cl" }, - { "pooling_layer_MxN_nhwc", "nhwc/pooling_layer.cl" }, - { "pooling_layer_2x2_nhwc", "nhwc/pooling_layer.cl" }, - { "pooling_layer_MxN_quantized_nhwc", "nhwc/pooling_layer_quantized.cl" }, - { "remap_nearest_neighbour_nhwc", "nhwc/remap.cl" }, - { "remap_bilinear_nhwc", "nhwc/remap.cl" }, - { "reorg_layer_nhwc", "nhwc/reorg_layer.cl" }, - { "scale_nearest_neighbour_nhwc", "nhwc/scale.cl" }, - { "scale_bilinear_nhwc", "nhwc/scale.cl" }, - { "space_to_batch_nhwc", "nhwc/space_to_batch.cl" }, - { "space_to_batch_static_nhwc", "nhwc/space_to_batch.cl" }, - { "space_to_depth_nhwc", "nhwc/space_to_depth.cl" }, - { "upsample_layer_nhwc", "nhwc/upsample_layer.cl" }, - { "winograd_filter_transform_4x1_3x1_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x3_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_3x3_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_5x5_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x1_5x1_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x5_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_2x2_7x7_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_2x1_7x1_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x2_1x7_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_input_transform_4x1_3x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_3x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_5x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_4x1_5x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_2x2_7x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_2x1_7x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_1x2_1x7_stepz1_nhwc", 
"nhwc/winograd_input_transform.cl" }, - { "winograd_output_transform_4x1_3x1_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x3_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_3x3_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_5x5_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_4x1_5x1_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x5_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_2x2_7x7_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_2x1_7x1_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_1x2_1x7_nhwc", "nhwc/winograd_output_transform.cl" }, -#endif /* ENABLE_NHWC_KERNELS */ -}; - -const std::map ClKernelLibrary::_program_source_map = -{ -#ifdef EMBEDDED_KERNELS - { - "common/activation_layer.cl", -#include "./cl_kernels/common/activation_layer.clembed" - }, - { - "common/activation_layer_quant.cl", -#include "./cl_kernels/common/activation_layer_quant.clembed" - }, - { - "common/arg_min_max.cl", -#include "./cl_kernels/common/arg_min_max.clembed" - }, - { - "common/bitwise_op.cl", -#include "./cl_kernels/common/bitwise_op.clembed" - }, - { - "common/bounding_box_transform.cl", -#include "./cl_kernels/common/bounding_box_transform.clembed" - }, - { - "common/bounding_box_transform_quantized.cl", -#include "./cl_kernels/common/bounding_box_transform_quantized.clembed" - }, - { - "common/col2im.cl", -#include "./cl_kernels/common/col2im.clembed" - }, - { - "common/comparisons.cl", -#include "./cl_kernels/common/comparisons.clembed" - }, - { - "common/concatenate.cl", -#include "./cl_kernels/common/concatenate.clembed" - }, - { - "common/convert_fc_weights.cl", -#include "./cl_kernels/common/convert_fc_weights.clembed" - }, - { - "common/convolution_layer.cl", -#include "./cl_kernels/common/convolution_layer.clembed" - }, - { - "common/copy_tensor.cl", -#include "./cl_kernels/common/copy_tensor.clembed" - }, - { - "common/crop_tensor.cl", -#include "./cl_kernels/common/crop_tensor.clembed" - }, - { - "common/deconvolution_layer.cl", -#include "./cl_kernels/common/deconvolution_layer.clembed" - }, - { - "common/cast.cl", -#include "./cl_kernels/common/cast.clembed" - }, - { - "common/dequantization_layer.cl", -#include "./cl_kernels/common/dequantization_layer.clembed" - }, - { - "common/elementwise_operation.cl", -#include "./cl_kernels/common/elementwise_operation.clembed" - }, - { - "common/elementwise_operation_quantized.cl", -#include "./cl_kernels/common/elementwise_operation_quantized.clembed" - }, - { - "common/elementwise_unary.cl", -#include "./cl_kernels/common/elementwise_unary.clembed" - }, - { - "common/fft.cl", -#include "./cl_kernels/common/fft.clembed" - }, - { - "common/fft_digit_reverse.cl", -#include "./cl_kernels/common/fft_digit_reverse.clembed" - }, - { - "common/fft_scale.cl", -#include "./cl_kernels/common/fft_scale.clembed" - }, - { - "common/fill_border.cl", -#include "./cl_kernels/common/fill_border.clembed" - }, - { - "common/floor.cl", -#include "./cl_kernels/common/floor.clembed" - }, - { - "common/gather.cl", -#include "./cl_kernels/common/gather.clembed" - }, - { - "common/gemm.cl", -#include "./cl_kernels/common/gemm.clembed" - }, - { - "common/gemm_v1.cl", -#include "./cl_kernels/common/gemm_v1.clembed" - }, - { - "common/gemmlowp.cl", -#include "./cl_kernels/common/gemmlowp.clembed" - }, - { - 
"common/gemv.cl", -#include "./cl_kernels/common/gemv.clembed" - }, - { - "common/generate_proposals.cl", -#include "./cl_kernels/common/generate_proposals.clembed" - }, - { - "common/generate_proposals_quantized.cl", -#include "./cl_kernels/common/generate_proposals_quantized.clembed" - }, - { - "helpers.h", -#include "./cl_kernels/helpers.hembed" - }, - { - "helpers_asymm.h", -#include "./cl_kernels/helpers_asymm.hembed" - }, - { - "common/instance_normalization.cl", -#include "./cl_kernels/common/instance_normalization.clembed" - }, - { - "common/l2_normalize.cl", -#include "./cl_kernels/common/l2_normalize.clembed" - }, - { - "common/mean_stddev_normalization.cl", -#include "./cl_kernels/common/mean_stddev_normalization.clembed" - }, - { - "common/memset.cl", -#include "./cl_kernels/common/memset.clembed" - }, - { - "common/minmax_layer.cl", -#include "./cl_kernels/common/minmax_layer.clembed" - }, - { - "common/nonmax.cl", -#include "./cl_kernels/common/nonmax.clembed" - }, - { - "common/batchnormalization_layer.cl", -#include "./cl_kernels/common/batchnormalization_layer.clembed" - }, - { - "common/pad_layer.cl", -#include "./cl_kernels/common/pad_layer.clembed" - }, - { - "common/permute.cl", -#include "./cl_kernels/common/permute.clembed" - }, - { - "common/pixelwise_mul_float.cl", -#include "./cl_kernels/common/pixelwise_mul_float.clembed" - }, - { - "common/pixelwise_mul_int.cl", -#include "./cl_kernels/common/pixelwise_mul_int.clembed" - }, - { - "common/pooling_layer.cl", -#include "./cl_kernels/common/pooling_layer.clembed" - }, - { - "common/qlstm_layer_normalization.cl", -#include "./cl_kernels/common/qlstm_layer_normalization.clembed" - }, - { - "common/quantization_layer.cl", -#include "./cl_kernels/common/quantization_layer.clembed" - }, - { - "common/range.cl", -#include "./cl_kernels/common/range.clembed" - }, - { - "common/reduction_operation.cl", -#include "./cl_kernels/common/reduction_operation.clembed" - }, - { - "common/reshape_layer.cl", -#include "./cl_kernels/common/reshape_layer.clembed" - }, - { - "common/reverse.cl", -#include "./cl_kernels/common/reverse.clembed" - }, - { - "common/roi_align_layer.cl", -#include "./cl_kernels/common/roi_align_layer.clembed" - }, - { - "common/roi_align_layer_quantized.cl", -#include "./cl_kernels/common/roi_align_layer_quantized.clembed" - }, - { - "common/roi_pooling_layer.cl", -#include "./cl_kernels/common/roi_pooling_layer.clembed" - }, - { - "common/select.cl", -#include "./cl_kernels/common/select.clembed" - }, - { - "common/softmax_layer.cl", -#include "./cl_kernels/common/softmax_layer.clembed" - }, - { - "common/softmax_layer_quantized.cl", -#include "./cl_kernels/common/softmax_layer_quantized.clembed" - }, - { - "common/slice_ops.cl", -#include "./cl_kernels/common/slice_ops.clembed" - }, - { - "common/stack_layer.cl", -#include "./cl_kernels/common/stack_layer.clembed" - }, - { - "common/tile.cl", -#include "./cl_kernels/common/tile.clembed" - }, - { - "common/transpose.cl", -#include "./cl_kernels/common/transpose.clembed" - }, - { - "types.h", -#include "./cl_kernels/types.hembed" - }, - { - "common/unpooling_layer.cl", -#include "./cl_kernels/common/unpooling_layer.clembed" - }, -#ifdef ENABLE_NCHW_KERNELS - { - "nchw/batch_to_space.cl", -#include "./cl_kernels/nchw/batch_to_space.clembed" - }, - { - "nchw/channel_shuffle.cl", -#include "./cl_kernels/nchw/channel_shuffle.clembed" - }, - { - "nchw/upsample_layer.cl", -#include "./cl_kernels/nchw/upsample_layer.clembed" - }, - { - "nchw/depth_to_space.cl", 
-#include "./cl_kernels/nchw/depth_to_space.clembed" - }, - { - "nchw/dequantization_layer.cl", -#include "./cl_kernels/nchw/dequantization_layer.clembed" - }, - { - "nchw/direct_convolution1x1.cl", -#include "./cl_kernels/nchw/direct_convolution1x1.clembed" - }, - { - "nchw/direct_convolution3x3.cl", -#include "./cl_kernels/nchw/direct_convolution3x3.clembed" - }, - { - "nchw/direct_convolution5x5.cl", -#include "./cl_kernels/nchw/direct_convolution5x5.clembed" - }, - { - "nchw/direct_convolution_quantized.cl", -#include "./cl_kernels/nchw/direct_convolution_quantized.clembed" - }, - { - "nchw/im2col.cl", -#include "./cl_kernels/nchw/im2col.clembed" - }, - { - "nchw/normalization_layer.cl", -#include "./cl_kernels/nchw/normalization_layer.clembed" - }, - { - "nchw/normalize_planar_yuv_layer.cl", -#include "./cl_kernels/nchw/normalize_planar_yuv_layer.clembed" - }, - { - "nchw/normalize_planar_yuv_layer_quantized.cl", -#include "./cl_kernels/nchw/normalize_planar_yuv_layer_quantized.clembed" - }, - { - "nchw/batchnormalization_layer.cl", -#include "./cl_kernels/nchw/batchnormalization_layer.clembed" - }, - { - "nchw/pooling_layer.cl", -#include "./cl_kernels/nchw/pooling_layer.clembed" - }, - { - "nchw/pooling_layer_quantized.cl", -#include "./cl_kernels/nchw/pooling_layer_quantized.clembed" - }, - { - "nchw/prior_box_layer.cl", -#include "./cl_kernels/nchw/prior_box_layer.clembed" - }, - { - "nchw/remap.cl", -#include "./cl_kernels/nchw/remap.clembed" - }, - { - "nchw/reorg_layer.cl", -#include "./cl_kernels/nchw/reorg_layer.clembed" - }, - { - "nchw/scale.cl", -#include "./cl_kernels/nchw/scale.clembed" - }, - { - "nchw/space_to_batch.cl", -#include "./cl_kernels/nchw/space_to_batch.clembed" - }, - { - "nchw/space_to_depth.cl", -#include "./cl_kernels/nchw/space_to_depth.clembed" - }, - { - "nchw/winograd_filter_transform.cl", -#include "./cl_kernels/nchw/winograd_filter_transform.clembed" - }, - { - "nchw/winograd_input_transform.cl", -#include "./cl_kernels/nchw/winograd_input_transform.clembed" - }, - { - "nchw/winograd_output_transform.cl", -#include "./cl_kernels/nchw/winograd_output_transform.clembed" - }, -#endif /* ENABLE_NCHW_KERNELS */ - -#ifdef ENABLE_NHWC_KERNELS - { - "nhwc/batch_to_space.cl", -#include "./cl_kernels/nhwc/batch_to_space.clembed" - }, - { - "nhwc/channel_shuffle.cl", -#include "./cl_kernels/nhwc/channel_shuffle.clembed" - }, - { - "nhwc/upsample_layer.cl", -#include "./cl_kernels/nhwc/upsample_layer.clembed" - }, - { - "nhwc/depth_to_space.cl", -#include "./cl_kernels/nhwc/depth_to_space.clembed" - }, - { - "nhwc/dequantization_layer.cl", -#include "./cl_kernels/nhwc/dequantization_layer.clembed" - }, - { - "nhwc/direct_convolution.cl", -#include "./cl_kernels/nhwc/direct_convolution.clembed" - }, - { - "nhwc/dwc_native_fp_nhwc.cl", -#include "./cl_kernels/nhwc/dwc_native_fp_nhwc.clembed" - }, - { - "nhwc/dwc_native_quantized_nhwc.cl", -#include "./cl_kernels/nhwc/dwc_native_quantized_nhwc.clembed" - }, - { - "nhwc/normalization_layer.cl", -#include "./cl_kernels/nhwc/normalization_layer.clembed" - }, - { - "nhwc/normalize_planar_yuv_layer.cl", -#include "./cl_kernels/nhwc/normalize_planar_yuv_layer.clembed" - }, - { - "nhwc/normalize_planar_yuv_layer_quantized.cl", -#include "./cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.clembed" - }, - { - "nhwc/im2col.cl", -#include "./cl_kernels/nhwc/im2col.clembed" - }, - { - "nhwc/batchnormalization_layer.cl", -#include "./cl_kernels/nhwc/batchnormalization_layer.clembed" - }, - { - "nhwc/pooling_layer.cl", 
-#include "./cl_kernels/nhwc/pooling_layer.clembed" - }, - { - "nhwc/pooling_layer_quantized.cl", -#include "./cl_kernels/nhwc/pooling_layer_quantized.clembed" - }, - { - "nhwc/remap.cl", -#include "./cl_kernels/nhwc/remap.clembed" - }, - { - "nhwc/reorg_layer.cl", -#include "./cl_kernels/nhwc/reorg_layer.clembed" - }, - { - "nhwc/scale.cl", -#include "./cl_kernels/nhwc/scale.clembed" - }, - { - "nhwc/space_to_batch.cl", -#include "./cl_kernels/nhwc/space_to_batch.clembed" - }, - { - "nhwc/space_to_depth.cl", -#include "./cl_kernels/nhwc/space_to_depth.clembed" - }, - { - "nhwc/winograd_filter_transform.cl", -#include "./cl_kernels/nhwc/winograd_filter_transform.clembed" - }, - { - "nhwc/winograd_input_transform.cl", -#include "./cl_kernels/nhwc/winograd_input_transform.clembed" - }, - { - "nhwc/winograd_output_transform.cl", -#include "./cl_kernels/nhwc/winograd_output_transform.clembed" - }, -#endif /* ENABLE_NHWC_KERNELS */ -#endif /* EMBEDDED_KERNELS */ -}; - -ClKernelLibrary &ClKernelLibrary::get() -{ - static ClKernelLibrary _kernel_library; - return _kernel_library; -} - -std::string ClKernelLibrary::program_name(const std::string &kernel_name) const -{ - // Find which program contains the kernel - auto kernel_program_it = _kernel_program_map.find(kernel_name); - - if(_kernel_program_map.end() == kernel_program_it) - { - ARM_COMPUTE_ERROR_VAR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str()); - } - - const std::string program_name = kernel_program_it->second; - - return program_name; -} - -void ClKernelLibrary::set_kernel_path(std::string kernel_path) -{ - _kernel_path = std::move(kernel_path); - _kernel_path += "/"; -} - -const std::string &ClKernelLibrary::kernel_path() const -{ - return _kernel_path; -} - -ClKernelLibrary::ClProgramInfo ClKernelLibrary::program(const std::string &program_name) const -{ -#ifdef EMBEDDED_KERNELS -#ifdef ARM_COMPUTE_COMPRESSED_KERNELS - const auto inflatted_program_source_it = _decompressed_source_map.find(program_name); - if(inflatted_program_source_it != _decompressed_source_map.end()) - { - return ClProgramInfo{ inflatted_program_source_it->second, false }; - } -#endif /* ARM_COMPUTE_COMPRESSED_KERNELS */ - - const auto program_source_it = _program_source_map.find(program_name); - if(program_source_it == _program_source_map.end()) - { - ARM_COMPUTE_ERROR_VAR("Embedded program for %s does not exist.", program_name.c_str()); - } - std::string program_source = program_source_it->second; - -#ifdef ARM_COMPUTE_COMPRESSED_KERNELS - std::string decompressed_program_source = decompress_zlib(decode_base64(program_source_it->second)); - ARM_COMPUTE_ERROR_ON_MSG(decompressed_program_source.empty(), "Cannot de-compress requested program"); - _decompressed_source_map.insert(std::make_pair(program_name, decompressed_program_source)); - program_source = std::move(decompressed_program_source); -#endif /* ARM_COMPUTE_COMPRESSED_KERNELS */ - - return ClProgramInfo{ program_source, false }; -#else /* EMBEDDED_KERNELS */ - // Check for binary - std::string source_name = _kernel_path + program_name; - std::string binary_name = source_name + "bin"; - std::string program_source{}; - bool is_binary = false; - - if(std::ifstream(binary_name).is_open()) - { - program_source = read_file(binary_name, true); - is_binary = true; - } - else if(std::ifstream(source_name).is_open()) - { - program_source = read_file(source_name, false); - } - else - { - ARM_COMPUTE_ERROR_VAR("Kernel file %s does not exist.", source_name.c_str()); - } - - return ClProgramInfo{ 
program_source, is_binary }; -#endif /* EMBEDDED_KERNELS */ -} -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/ClKernelLibrary.h b/src/core/gpu/cl/ClKernelLibrary.h deleted file mode 100644 index 42bec95032..0000000000 --- a/src/core/gpu/cl/ClKernelLibrary.h +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_KERNEL_LIBRARY_H -#define ARM_COMPUTE_CL_KERNEL_LIBRARY_H - -#include -#include -#include - -namespace arm_compute -{ -namespace opencl -{ -/** ClKernelLibrary contains all the OpenCL kernels that are used throughout the library - * - * @note Kernel library is a singleton to reduce memory requirements - * @note Sole responsibility is just to provide access to the kernel string, - * does not perform any compilation and relevant tasks - */ -class ClKernelLibrary final -{ -private: - /** Default Constructor */ - ClKernelLibrary() = default; - /** Prevent instances of this class from being copied */ - ClKernelLibrary(const ClKernelLibrary &) = delete; - /** Prevent instances of this class from being copied */ - const ClKernelLibrary &operator=(const ClKernelLibrary &) = delete; - -public: - /** Structure to encapsulte program related information */ - struct ClProgramInfo - { - std::string program{}; /**< Program raw string */ - bool is_binary{ false }; /**< Flag that indicates if is in binary format */ - }; - -public: - /** Access the KernelLibrary singleton - * - * @return The KernelLibrary instance - */ - static ClKernelLibrary &get(); - /** Sets the path that the kernels reside in - * - * @param[in] kernel_path Path of the kernel - */ - void set_kernel_path(std::string kernel_path); - /** Gets the path that the kernels reside in - */ - const std::string &kernel_path() const; - /** Gets the source of the selected program - * - * @param[in] program_name Program name - * - * @return A pair with the source (false) or the binary (true), of the selected program - */ - ClProgramInfo program(const std::string &program_name) const; - /** Returns the program name given a kernel name - * - * @return Program name - */ - std::string program_name(const std::string &kernel_name) const; - -private: - std::string _kernel_path{}; /**< Path to the kernels folder. 
*/ - mutable std::map _decompressed_source_map{}; /**< Map holding the decompressed files when compression is used */ - static const std::map _kernel_program_map; /**< Map that associates kernel names with programs. */ - static const std::map _program_source_map; /**< Contains sources for all programs. - Used for compile-time kernel inclusion. >*/ -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_KERNEL_LIBRARY_H */ diff --git a/src/core/gpu/cl/IClKernel.h b/src/core/gpu/cl/IClKernel.h deleted file mode 100644 index 52ea3c9183..0000000000 --- a/src/core/gpu/cl/IClKernel.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_ICL_KERNEL_H -#define ARM_COMPUTE_ICL_KERNEL_H - -#include "arm_compute/core/ITensorInfo.h" -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -using IClKernel = arm_compute::ICLKernel; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_ICL_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClActivationKernel.cpp b/src/core/gpu/cl/kernels/ClActivationKernel.cpp deleted file mode 100644 index 21c05632f9..0000000000 --- a/src/core/gpu/cl/kernels/ClActivationKernel.cpp +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
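As a minimal sketch of how the kernel library deleted above is driven (the method names and ClProgramInfo fields follow the declarations in this header; the kernel path and the "floor_layer" lookup are illustrative, the latter taken from the kernel map earlier in this file):

// Sketch only: resolve the program that provides a given kernel name.
auto &lib = arm_compute::opencl::ClKernelLibrary::get();
lib.set_kernel_path("/opt/acl/cl_kernels");                     // consulted only when kernels are not embedded
const std::string prog_name = lib.program_name("floor_layer");  // -> "common/floor.cl" per the map above
const auto prog_info = lib.program(prog_name);
// prog_info.program holds the OpenCL C source (or a program binary when prog_info.is_binary is true)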
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClActivationKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" - -#include "support/StringSupport.h" - -#include - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::F16, DataType::F32); - - static std::set quantized_supported_activations = - { - ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LOGISTIC, - ActivationLayerInfo::ActivationFunction::TANH, - ActivationLayerInfo::ActivationFunction::HARD_SWISH, - ActivationLayerInfo::ActivationFunction::LEAKY_RELU, - }; - const DataType data_type = src->data_type(); - const QuantizationInfo &oq_info = (dst != nullptr) ? dst->quantization_info() : src->quantization_info(); - const ActivationLayerInfo::ActivationFunction f_act = act_info.activation(); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(data_type) && (quantized_supported_activations.count(f_act) == 0), - "For Quantized data type only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported"); - - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 128))); - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, 0))); - - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); - - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 0))); - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, -128))); - - // Checks performed when destination is configured - if((dst != nullptr) && (dst->total_size() != 0)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - - return Status{}; -} -} // namespace - -ClActivationKernel::ClActivationKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void 
ClActivationKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo act_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src); - - auto padding_info = get_padding_info({ src, dst }); - - _run_in_place = (dst == nullptr) || (dst == src); - - if(dst != nullptr) - { - // Destination auto inizialitation if not yet initialized - auto_init_if_empty(*dst, *src->clone()); - } - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, (dst != nullptr) ? dst : nullptr, act_info)); - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0)); - - const DataType dt = src->data_type(); - float a_const = act_info.a(); - float b_const = act_info.b(); - - const ActivationLayerInfo::ActivationFunction f_act = act_info.activation(); - const bool is_quantized = is_data_type_quantized(dt); - const bool perform_activation_in_float = - (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - || (f_act == ActivationLayerInfo::ActivationFunction::TANH) - || (f_act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - || (f_act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU); - - // Set build options - CLBuildOptions build_opts; - build_opts.add_option_if(perform_activation_in_float, "-DFLOAT_DOMAIN"); - build_opts.add_option_if(_run_in_place, "-DIN_PLACE"); - build_opts.add_option("-DACT=" + lower_string(string_from_activation_func(f_act))); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - - std::string kernel_name = std::string("activation_layer"); - - // Set quantization info build options - if(is_quantized) - { - const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); - - if(!perform_activation_in_float) - { - int a_const_int = 0; - int b_const_int = 0; - - // Create quantized version of constants a, b if needed - switch(dt) - { - case DataType::QASYMM8: - { - a_const_int = quantize_qasymm8(a_const, iq_info); - b_const_int = quantize_qasymm8(b_const, iq_info); - } - break; - case DataType::QASYMM8_SIGNED: - { - a_const_int = quantize_qasymm8_signed(a_const, iq_info); - b_const_int = quantize_qasymm8_signed(b_const, iq_info); - } - break; - case DataType::QSYMM16: - { - a_const_int = quantize_qsymm16(a_const, iq_info); - b_const_int = quantize_qsymm16(b_const, iq_info); - } - break; - default: - break; - } - build_opts.add_option(("-DA_VAL=" + support::cpp11::to_string(a_const_int))); - build_opts.add_option(("-DB_VAL=" + support::cpp11::to_string(b_const_int))); - } - else - { - build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(a_const))); - build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(b_const))); - } - - // Quantized value of 0 corresponds to the offset o1 - build_opts.add_option(("-DCONST_0=" + (is_data_type_quantized_asymmetric(dt) ? support::cpp11::to_string(iq_info.offset) : "0"))); - build_opts.add_option(("-DS1_VAL=" + float_to_string_with_full_precision(iq_info.scale))); - build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DO1_VAL=" + support::cpp11::to_string(iq_info.offset)); - - // Set correct kernel name - kernel_name += perform_activation_in_float ? 
std::string("_quant_f32") : std::string("_quant"); - - // Set scale and offset of the source and destination if they have different quantization info - if(dst != nullptr) - { - const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); - - if(iq_info != oq_info) - { - build_opts.add_option(("-DS2_VAL=" + float_to_string_with_full_precision(oq_info.scale))); - build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DO2_VAL=" + support::cpp11::to_string(oq_info.offset)); - } - } - } - else - { - // Set A, B constants in build options for float types - build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(a_const))); - build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(b_const))); - } - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = "activation_layer_"; - _config_id += lower_string(string_from_data_type(dt)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(1)); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, act_info)); - return Status{}; -} - -void ClActivationKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - ARM_COMPUTE_ERROR_ON(_run_in_place && src != dst); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - if(!_run_in_place) - { - add_3D_tensor_argument(idx, dst, slice); - } - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClActivationKernel.h b/src/core/gpu/cl/kernels/ClActivationKernel.h deleted file mode 100644 index 720b16a691..0000000000 --- a/src/core/gpu/cl/kernels/ClActivationKernel.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_ACTIVATION_KERNEL_H -#define ARM_COMPUTE_CL_ACTIVATION_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the activation kernel. */ -class ClActivationKernel : public IClKernel -{ -public: - ClActivationKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClActivationKernel); - /** Configure kernel for a given list of arguments - * - * @note If the output tensor is a nullptr, the activation function will be performed in-place - * - * @param[in] compile_context The compile context to be used. - * @param[in, out] src Source tensor info. In case of @p dst tensor = nullptr, this tensor will store the result - * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. - * @param[out] dst Destination tensor info. Data type supported: same as @p src - * @param[in] act_info Activation layer information. - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo act_info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClActivationKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; - -private: - bool _run_in_place{ false }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_ACTIVATION_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp b/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp deleted file mode 100644 index fba1b0e087..0000000000 --- a/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
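The interface removed just above documents that a null dst makes the activation run in place. A hedged configuration sketch (compile_context, src_info and the chosen activation function are illustrative placeholders):

// Sketch only: ReLU applied in place on one tensor, per the in-place contract above.
ClActivationKernel act;
act.configure(compile_context, &src_info, /*dst=*/nullptr,
              ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
// The same argument combination can be pre-checked with:
// Status st = ClActivationKernel::validate(&src_info, nullptr, act_info);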
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX)); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) != dst->dimension(Window::DimY)); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimZ) != dst->dimension(Window::DimZ)); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(3) + batch_offset > dst->dimension(3)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(4, src, dst); - - return Status{}; -} -} // namespace - -ClBatchConcatenateKernel::ClBatchConcatenateKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, batch_offset, dst)); - - auto padding_info = get_padding_info({ src, dst }); - - _batch_offset = batch_offset; - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0)); - - // Add build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) - { - const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); - - build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); - build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); - } - - // Create kernel - _kernel = create_kernel(compile_context, "concatenate", build_opts.options()); - - // Configure kernel window - auto win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); - win.set(3, Window::Dimension(0, src->tensor_shape()[3], 1)); - ICLKernel::configure_internal(win); - - // Set config_id 
for enabling LWS tuning - _config_id = "concatenate_"; - _config_id += support::cpp11::to_string(3); - _config_id += "_"; - _config_id += support::cpp11::to_string(batch_offset); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(3)); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClBatchConcatenateKernel::validate(const arm_compute::ITensorInfo *src, - unsigned int batch_offset, - const arm_compute::ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, batch_offset, dst)); - return Status{}; -} - -void ClBatchConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window slice = window.first_slice_window_3D(); - - const int offset_to_first_elements_in_bytes = _batch_offset * dst->info()->strides_in_bytes()[3]; - - unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters - _kernel.setArg(idx, offset_to_first_elements_in_bytes); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace opencl -} // namespace kernels -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h b/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h deleted file mode 100644 index 2963d7cdfd..0000000000 --- a/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CL_BATCH_CONCATENATE_KERNEL_H -#define ARM_COMPUTE_CL_BATCH_CONCATENATE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the batch concatenate kernel. - * The src tensor will be concatenated into the destination tensor. - */ -class ClBatchConcatenateKernel : public IClKernel -{ -public: - ClBatchConcatenateKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClBatchConcatenateKernel); - /** Initialise the kernel's source and destination - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: All. - * @param[in] batch_offset The offset on axis # 3. - * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src. - * - * @note: The dst tensor's low two dimensions can't be smaller than the src one's. - * @note: The gaps between the two lowest dimensions of src and dst need to be divisible by 2. - * - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClBatchConcatenateKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; - -private: - unsigned int _batch_offset{ 0 }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_BATCH_CONCATENATE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClCastKernel.cpp b/src/core/gpu/cl/kernels/ClCastKernel.cpp deleted file mode 100644 index fac9ebe5cf..0000000000 --- a/src/core/gpu/cl/kernels/ClCastKernel.cpp +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
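ClBatchConcatenateKernel::run_op() above offsets every write by batch_offset * strides_in_bytes()[3] of the destination, so successive sources can be stacked along axis 3. A tiny sketch of that offset rule (the helper name and array-based strides are illustrative):

#include <cstddef>

// Sketch only: byte offset at which one source starts inside the concatenated dst,
// mirroring offset_to_first_elements_in_bytes in ClBatchConcatenateKernel::run_op().
std::size_t batch_concat_offset_bytes(unsigned int batch_offset, const std::size_t dst_strides_in_bytes[4])
{
    return static_cast<std::size_t>(batch_offset) * dst_strides_in_bytes[3];
}
// Concatenating src_b after src_a along axis 3 would use batch_offset = src_a.dimension(3).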
- */ -#include "src/core/gpu/cl/kernels/ClCastKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy) -{ - ARM_COMPUTE_UNUSED(policy); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON(src == dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, - 1, - DataType::U8, DataType::S8, DataType::QSYMM8_PER_CHANNEL, DataType::S16, - DataType::U16, DataType::U32, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, - 1, - DataType::U8, DataType::S8, DataType::QASYMM8, DataType::S16, - DataType::U16, DataType::U32, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == dst->data_type(), "src and dst data types must be different"); - - // Validate in case of configured dst - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - - return Status{}; -} -} // namespace - -ClCastKernel::ClCastKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClCastKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Auto initialize dst shape if not initialized (We can only auto-configure the shape, datatype must be given) - set_shape_if_empty(*dst, src->tensor_shape()); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, policy)); - - auto padding_info = get_padding_info({ src, dst }); - - // Get data sizes - const size_t src_size = data_size_from_type(src->data_type()); - const size_t dst_size = data_size_from_type(dst->data_type()); - - // Get number of elements to process per iterations - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0)); - - // Set build options - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type())); - // Conversions from float always SATURATE as out-of-bounds conversion from float->integer is implementation defined - build_opts.add_option_if(is_data_type_float(src->data_type()) || policy == ConvertPolicy::SATURATE, "-DSATURATE"); - build_opts.add_option_if(is_data_type_float(src->data_type()) || is_data_type_float(dst->data_type()), "-DIS_DATA_TYPE_FLOAT"); - build_opts.add_option_if(is_data_type_quantized(src->data_type()), "-DIS_DATA_TYPE_QUANTIZED"); - - // Create kernel - const std::string kernel_name = (src_size >= dst_size) ? 
"cast_down" : "cast_up"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel - Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); - - // Collapse window - const Window &full_window = window(); - Window collapsed_window = full_window.collapse_if_possible(full_window, Window::DimZ); - ICLKernel::configure_internal(collapsed_window); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(src->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(1)); -} - -Status ClCastKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, policy)); - return Status{}; -} - -void ClCastKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClCastKernel.h b/src/core/gpu/cl/kernels/ClCastKernel.h deleted file mode 100644 index 6bf3cd9e50..0000000000 --- a/src/core/gpu/cl/kernels/ClCastKernel.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CL_CAST_KERNEL_H -#define ARM_COMPUTE_CL_CAST_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Casts a given tensor to a new type - * - * @note When casting between quantized types the scale and zeroPoint are ignored - */ -class ClCastKernel : public IClKernel -{ -public: - ClCastKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCastKernel); - /** Set the src and dst of the kernel. - * - * Valid conversions src -> dst : - * - * - QSYMM8_PER_CHANNEL -> QASYMM8 (ATTENTION: it is the user's responsibility to keep track of the quantization info in the TensorInfo meta-data) - * - U8 -> S8, U16, S16, U32, S32, F16, F32 - * - U16 -> U8, S8, S16, U32, S32, F16, F32 - * - S16 -> U8, S8, U16, U32, S32, F16, F32 - * - U32 -> U8, S8, U16, S16, S32, F16, F32 - * - S32 -> U8, S8, U16, S16, U32, F16, F32 - * - F16 -> U8, S8, U16, S16, U32, F32 - * - F32 -> U8, S8, U16, S16, U32, F16 - * - * @param[in] compile_context The compile context to be used. - * @param[in] src The source tensor to convert. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32. - * @param[out] dst The destination tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. - * @param[in] policy Conversion policy - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClCastKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_CAST_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClCol2ImKernel.cpp b/src/core/gpu/cl/kernels/ClCol2ImKernel.cpp deleted file mode 100644 index a3d57115f9..0000000000 --- a/src/core/gpu/cl/kernels/ClCol2ImKernel.cpp +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClCol2ImKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -#include - -namespace arm_compute -{ -using namespace misc::shape_calculator; -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - - // Checks performed when output is configured - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, true, num_groups)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_layout() != DataLayout::NCHW, "Col2Im output's data layout must always be NCHW"); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_col2im_shape(*src, convolved_dims, true, num_groups)).set_data_layout(DataLayout::NCHW)); - - constexpr unsigned int num_elems_read_per_iteration = 8; - - // Configure window - Window win = calculate_max_window(*src, Steps(num_elems_read_per_iteration)); - - // Update window and padding just for the input tensor as we cannot access out-of-bounds elements in the output one - AccessWindowHorizontal input_access(src, 0, num_elems_read_per_iteration); - bool window_changed = update_window_and_padding(win, input_access); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} -} // namespace - -ClCol2ImKernel::ClCol2ImKernel() - : _convolved_dims() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClCol2ImKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, convolved_dims, num_groups)); - - _convolved_dims = convolved_dims; - - const DataType data_type = src->data_type(); - - // Create kernel - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); - build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src->element_size())); - build_opts.add_option("-DWIDTH_INPUT=" + support::cpp11::to_string(src->dimension(0))); - build_opts.add_option("-DWIDTH_OUTPUT=" + support::cpp11::to_string(_convolved_dims.width)); - build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups)); - - _kernel = create_kernel(compile_context, "col2im", build_opts.options()); - - // Configure kernel window - auto win_config = validate_and_configure_window(src, dst, _convolved_dims, num_groups); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - IClKernel::configure_internal(win_config.second); - - // Set config_id for enabling LWS tuning - _config_id = "col2im_"; - _config_id += lower_string(string_from_data_type(src->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(num_groups); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(1)); -} - -Status ClCol2ImKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, convolved_dims, num_groups)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), convolved_dims, num_groups).first); - return Status{}; -} - -void ClCol2ImKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IClKernel::window(), window); - - bool is_collapsed = false; - bool is_collapsed_out = false; - - auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window out_window; - out_window.use_tensor_dimensions(dst->info()->tensor_shape()); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &is_collapsed); - Window collapsed_out = out_window.collapse_if_possible(out_window, 3, &is_collapsed_out); - - ARM_COMPUTE_ERROR_ON(is_collapsed != is_collapsed_out); - - Window slice = collapsed.first_slice_window_3D(); - Window slice_out = collapsed_out.first_slice_window_4D(); - do - { - // Set inputs - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_4D_tensor_argument(idx, dst, slice_out); - enqueue(queue, *this, slice, lws_hint()); - } - 
while(collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClCol2ImKernel.h b/src/core/gpu/cl/kernels/ClCol2ImKernel.h deleted file mode 100644 index 74a9027628..0000000000 --- a/src/core/gpu/cl/kernels/ClCol2ImKernel.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_COL2IM_KERNEL_H -#define ARM_COMPUTE_CL_COL2IM_KERNEL_H - -#include "arm_compute/core/Size2D.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the col2im reshaping kernel. - * - * Rearranges each matrix column into image blocks. It's the inverse operation of @ref opencl::kernels::ClIm2ColKernel. - * - * For example, a vector of 9 elements can be reshaped to a block(image) of 3x3: - * - * @f[ - * \left( \begin{array}{ccccccccc} - * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\ - * \end{array} \right) - * \rightarrow - * \left( \begin{array}{ccc} - * a0 & a1 & a2 \\ - * a3 & a4 & a5 \\ - * a6 & a7 & a8 \\ - * \end{array} \right) - * @f] - */ -class ClCol2ImKernel : public IClKernel -{ -public: - /** Default constructor */ - ClCol2ImKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCol2ImKernel); - /** Set the input and output of the kernel. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src The input tensor info to convert. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - * @param[out] dst The output tensor info. 3 lower dimensions represent a single output [width, height, OFM], - * while the rest represent batch of outputs. Data types supported: Same as @p input. Data layout: NCHW - * @param[in] convolved_dims Output convolved dimensions. 
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to ClCol2ImKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -public: - Size2D _convolved_dims; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /*ARM_COMPUTE_CL_COL2IM_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp b/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp deleted file mode 100644 index d1abd274d6..0000000000 --- a/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -ClConvertFullyConnectedWeightsKernel::ClConvertFullyConnectedWeightsKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, - DataLayout data_layout) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Output tensor auto initialisation if not yet initialized - auto_init_if_empty(*dst, *src->clone()); - - auto padding_info = get_padding_info({ src, dst }); - - ARM_COMPUTE_ERROR_THROW_ON(ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout)); - - const DataLayout src_data_layout = (data_layout == DataLayout::NCHW) ? DataLayout::NHWC : DataLayout::NCHW; - - const int width_idx = get_data_layout_dimension_index(src_data_layout, DataLayoutDimension::WIDTH); - const int height_idx = get_data_layout_dimension_index(src_data_layout, DataLayoutDimension::HEIGHT); - const int channel_idx = get_data_layout_dimension_index(src_data_layout, DataLayoutDimension::CHANNEL); - - const unsigned int num_elems_per_src_plane = original_src_shape[width_idx] * original_src_shape[height_idx]; - const unsigned int num_channels = original_src_shape[channel_idx]; - - const unsigned int factor_1 = (data_layout == DataLayout::NCHW) ? num_elems_per_src_plane : num_channels; - const unsigned int factor_2 = (data_layout == DataLayout::NCHW) ? 
num_channels : num_elems_per_src_plane; - - // Set build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size())); - build_opts.add_option("-DFACTOR_1=" + support::cpp11::to_string(factor_1)); - build_opts.add_option("-DFACTOR_2=" + support::cpp11::to_string(factor_2)); - - // Create kernel - _kernel = create_kernel(compile_context, "convert_fc_weights", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, - DataLayout data_layout) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() != 2); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(1) != original_src_shape.total_size_lower(3)); - ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN); - - // Checks performed when dst is configured - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - - return Status{}; -} - -void ClConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - unsigned int idx = 0; - add_2D_tensor_argument(idx, src, window); - add_2D_tensor_argument(idx, dst, window); - enqueue(queue, *this, window, lws_hint()); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h b/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h deleted file mode 100644 index 3976fd45db..0000000000 --- a/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H -#define ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -/** Interface to convert the 2D Fully Connected weights from NCHW to NHWC or vice versa. - * - * @note This function can be applied to the 2D weights used by a Fully Connected layer if: - * - It follows a Convolution layer - * - The data layout used by the network does not match the one the model has been trained in. - * - * @note This function assumes the weights are already reshaped (transposed) - */ -namespace opencl -{ -namespace kernels -{ -class ClConvertFullyConnectedWeightsKernel : public IClKernel -{ -public: - ClConvertFullyConnectedWeightsKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClConvertFullyConnectedWeightsKernel); - /** Set the src and dst tensor. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All. - * @param[out] dst The converted weights tensor info. Shape and Data Type: Same as @p src. - * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer). - * @param[in] data_layout The data layout the weights have been trained in. - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClConvertFullyConnectedWeightsKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClCopyKernel.cpp b/src/core/gpu/cl/kernels/ClCopyKernel.cpp deleted file mode 100644 index 98c6f34e60..0000000000 --- a/src/core/gpu/cl/kernels/ClCopyKernel.cpp +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClCopyKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window = nullptr) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - - // Validate dst if initialized - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - if(dst_window == nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape()); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst_window->shape()); - } - } - - return Status{}; -} - -} // namespace - -ClCopyKernel::ClCopyKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClCopyKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Window *dst_window) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, dst_window)); - - auto padding_info = get_padding_info({ src, dst }); - - // Create kernel - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, *src); - - // Configure window - const unsigned int vec_size_x = adjust_vec_size(16 / src->element_size(), src->dimension(0)); - - const Window win_config = calculate_max_window(*src, Steps(vec_size_x)); - - if(dst_window != nullptr) - { - _has_dst_window = true; - _dst_window = Window(*dst_window); - const int width_x = dst_window->num_iterations(0); - const int vec_size_x_leftover = width_x % vec_size_x; - const bool multi_access_x = width_x >= static_cast(vec_size_x); - - if(multi_access_x) - { - _dst_window.set(Window::DimX, Window::Dimension(dst_window->x().start(), ceil_to_multiple(dst_window->x().end(), vec_size_x), vec_size_x)); - } - - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftover)); - } - else - { - const int width_x = src->tensor_shape().x(); - const int vec_size_x_leftover = width_x % vec_size_x; - - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftover)); - } - - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); - - // Build kernel - _kernel = create_kernel(compile_context, "copy_tensor", build_opts.options()); - - // Validate and 
set the window - ICLKernel::configure_internal(win_config); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, Window *dst_window) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, dst_window)); - - return Status{}; -} - -void ClCopyKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window slice; - - if(_has_dst_window) - { - slice = window.first_slice_window_3D(); - Window out_slice = _dst_window.first_slice_window_3D(); - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, out_slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice) && _dst_window.slide_window_slice_3D(out_slice)); - } - else - { - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - slice = collapsed.first_slice_window_3D(); - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); - } -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClCopyKernel.h b/src/core/gpu/cl/kernels/ClCopyKernel.h deleted file mode 100644 index d2732c4e59..0000000000 --- a/src/core/gpu/cl/kernels/ClCopyKernel.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_COPY_KERNEL_H -#define ARM_COMPUTE_CL_COPY_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to perform a copy between two tensors */ -class ClCopyKernel : public IClKernel -{ -public: - ClCopyKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCopyKernel); - /** Initialize the kernel's src, dst. 
- * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: All. - * @param[out] dst Destination tensor info. Data types supported: same as @p src. - * @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr. - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Window *dst_window = nullptr); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClCopyKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window = nullptr); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - Window _dst_window{}; - bool _has_dst_window{}; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_COPY_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClCropKernel.cpp b/src/core/gpu/cl/kernels/ClCropKernel.cpp deleted file mode 100644 index a052ef53f9..0000000000 --- a/src/core/gpu/cl/kernels/ClCropKernel.cpp +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClCropKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "src/core/CPP/Validate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -#include - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -ClCropKernel::ClCropKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClCropKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, - float extrapolation_value, Window *dst_window) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, start, end, batch_index, extrapolation_value, dst_window)); - - _start = start; - _batch_index = batch_index; - _extrapolation_value = extrapolation_value; - - const int vec_size_x = 4; - // Create and update the window (if needed) - Window win = calculate_max_window(*dst); - - if(dst_window != nullptr) - { - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *dst_window); - win = *dst_window; - } - - const int dst_width_x = win.num_iterations(0); - const bool multi_access_x = dst_width_x >= vec_size_x; - const bool remainder_x = dst_width_x % vec_size_x > 0; - - if(multi_access_x) - { - win.set(Window::DimX, - Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); - } - ICLKernel::configure_internal(win); - - // Create kernel - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); - build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(dst_width_x - vec_size_x, 0))); - build_opts.add_option_if(start.x > end.x, "-DWIDTH_FLIPPED="); - build_opts.add_option_if(start.y > end.y, "-DHEIGHT_FLIPPED="); - _kernel = create_kernel(compile_context, "crop_tensor", build_opts.options()); -} - -Status ClCropKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window) -{ - ARM_COMPUTE_UNUSED(extrapolation_value, dst_window); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); - ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().num_dimensions() > 4); - ARM_COMPUTE_RETURN_ERROR_ON(start.x < 0 || start.y < 0 || end.x < 0 || end.y < 0); - ARM_COMPUTE_RETURN_ERROR_ON(start.x >= static_cast(src->dimension(1)) || start.y >= static_cast(src->dimension(2)) - || end.x >= static_cast(src->dimension(1)) || end.y >= static_cast(src->dimension(2))); - ARM_COMPUTE_RETURN_ERROR_ON(batch_index >= src->dimension(3)); - if(dst_window != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON(dst_window->x().step() != 1); - } - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(dst, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(dst->num_dimensions() > 3); - } - return Status{}; -} - -void ClCropKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue 
&queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window in_slice = Window(); - in_slice.use_tensor_dimensions(src->info()->tensor_shape()); - in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), ceil_to_multiple(in_slice.x().end(), window.x().step()), window.x().step())); - in_slice.set(3, Window::Dimension(_batch_index, _batch_index + 1, 1)); - - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, in_slice); - add_3D_tensor_argument(idx, dst, window); - add_argument(idx, _start.x); - add_argument(idx, _start.y); - enqueue(queue, *this, window, lws_hint()); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClCropKernel.h b/src/core/gpu/cl/kernels/ClCropKernel.h deleted file mode 100644 index d81912284e..0000000000 --- a/src/core/gpu/cl/kernels/ClCropKernel.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_CROP_KERNEL_H -#define ARM_COMPUTE_CL_CROP_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to perform a copy between two tensors */ -class ClCropKernel : public IClKernel -{ -public: - ClCropKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCropKernel); - /** Configure kernel - * - * @note Supported tensor rank: up to 4 - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data type supported: All. Data layouts supported: NHWC. - * @param[out] dst Destination tensor info. Data type supported: F32 - * @param[in] start Coordinates of where to start cropping the image. - * @param[in] end Coordinates of where to end cropping the image. - * @param[in] batch_index Fourth dimension index of the 3D image to crop in @p src. - * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0. - * @param[in] dst_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr. 
- */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, - Window *dst_window = nullptr); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClCropKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, - Window *dst_window = nullptr); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - Coordinates2D _start{}; - uint32_t _batch_index{}; - float _extrapolation_value{}; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_CROP_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp b/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp deleted file mode 100644 index e3e384f748..0000000000 --- a/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX)); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) != dst->dimension(Window::DimY)); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(2) + depth_offset > dst->dimension(2)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(3, src, dst); - - return Status{}; -} -} // namespace - -ClDepthConcatenateKernel::ClDepthConcatenateKernel() - : _depth_offset(0) -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, depth_offset, dst)); - - auto padding_info = get_padding_info({ src, dst }); - - _depth_offset = depth_offset; - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0)); - - // Add build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) - { - const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); - - build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); - build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); - } - - // Create kernel - _kernel = create_kernel(compile_context, "concatenate", build_opts.options()); - - // Configure kernel window - auto win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); - win.set(Window::DimZ, Window::Dimension(0, src->tensor_shape().z(), 1)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClDepthConcatenateKernel::validate(const arm_compute::ITensorInfo *src, - unsigned int depth_offset, - const arm_compute::ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, depth_offset, 
dst)); - return Status{}; -} - -void ClDepthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window slice = window.first_slice_window_3D(); - - const int offset_to_first_elements_in_bytes = _depth_offset * dst->info()->strides_in_bytes()[2]; - - unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters - _kernel.setArg(idx, offset_to_first_elements_in_bytes); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h b/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h deleted file mode 100644 index 0f408477b1..0000000000 --- a/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_DEPTH_CONCATENATE_KERNEL_H -#define ARM_COMPUTE_CL_DEPTH_CONCATENATE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the depth concatenate kernel. - * The src tensor will be concatenated into the dst tensor. - */ -class ClDepthConcatenateKernel : public IClKernel -{ -public: - ClDepthConcatenateKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDepthConcatenateKernel); - /** Initialise the kernel's source and destination - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] depth_offset The offset on the Z axis. - * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src. - * - * @note: The dst tensor's low two dimensions can't be smaller than the src one's. 
- * @note: The gaps between the two lowest dimensions of src and dst need to be divisible by 2. - * - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClDepthConcatenateKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; - -private: - unsigned int _depth_offset; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_DEPTH_CONCATENATE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClDequantizeKernel.cpp b/src/core/gpu/cl/kernels/ClDequantizeKernel.cpp deleted file mode 100644 index d69da8716c..0000000000 --- a/src/core/gpu/cl/kernels/ClDequantizeKernel.cpp +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClDequantizeKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" - -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16); - - if(dst->tensor_shape().total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - - return Status{}; -} -} // namespace - -ClDequantizeKernel::ClDequantizeKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClDequantizeKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src->tensor_shape(), 1, DataType::F32); - - auto padding_info = get_padding_info({ src, dst }); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - - const int vec_size_x = 16 / dst->element_size(); - const int output_width_x = dst->tensor_shape().x(); - const bool multi_access_x = (output_width_x / vec_size_x > 0); - - const bool is_quantized_per_channel = is_data_type_quantized_per_channel(src->data_type()); - std::string kernel_name = "dequantization_layer"; - - // Create kernel - CLBuildOptions build_opts; - if(!is_quantized_per_channel) - { - const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); - const int qoffset = is_data_type_quantized_asymmetric(src->data_type()) ? qinfo.offset : 0; - build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale)); - build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qoffset)); - } - else - { - kernel_name += "_per_channel"; - kernel_name += src->data_layout() == DataLayout::NCHW ? 
"_nchw" : "_nhwc"; - } - - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); - build_opts.add_option("-DDATA_TYPE_SRC=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option("-DDATA_TYPE_DST=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(output_width_x - vec_size_x, 0))); - - // Create kernel name - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*dst); - if(multi_access_x) - { - win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); - } - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClDequantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); - return Status{}; -} - -void ClDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - const bool is_quantized_per_channel = is_data_type_quantized_per_channel(src->info()->data_type()); - - // Collapse windo - Window new_window = is_quantized_per_channel ? window.collapse_if_possible(ICLKernel::window(), 4) : window.collapse_if_possible(ICLKernel::window(), 3); - Window slice = new_window.first_slice_window_3D(); - - if(is_quantized_per_channel) - { - unsigned int idx = num_arguments_per_3D_tensor() * 2; //Skip the input and output parameters - _kernel.setArg(idx++, src->quantization().scale->cl_buffer()); - } - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(new_window.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClDequantizeKernel.h b/src/core/gpu/cl/kernels/ClDequantizeKernel.h deleted file mode 100644 index 0912e1b228..0000000000 --- a/src/core/gpu/cl/kernels/ClDequantizeKernel.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_DEQUANTIZE_KERNEL_H -#define ARM_COMPUTE_CL_DEQUANTIZE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the dequantization layer kernel. */ -class ClDequantizeKernel : public IClKernel -{ -public: - ClDequantizeKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDequantizeKernel); - /** Initialise the kernel's input and output - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. - * @param[out] dst Destination tensor info. Data types supported: F16/F32. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClDequantizeKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_DEQUANTIZE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp deleted file mode 100644 index 7b98671da2..0000000000 --- a/src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp +++ /dev/null @@ -1,672 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
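The kernel removed above implements a straightforward mapping: every quantized input value q becomes scale * (q - offset), and the per-channel variant uses one scale per output channel with a zero offset. The -DSCALE/-DOFFSET/-DVEC_SIZE build options assembled in configure() pass exactly those constants to the OpenCL program. As a point of reference, a minimal self-contained CPU sketch of the same math; the helper names and the channels-innermost layout are illustrative only, not library code:

#include <cstddef>
#include <cstdint>
#include <vector>

// Per-tensor (uniform) dequantization: one scale/offset shared by every element.
std::vector<float> dequantize_uniform(const std::vector<uint8_t> &q, float scale, int32_t offset)
{
    std::vector<float> out(q.size());
    for(size_t i = 0; i < q.size(); ++i)
    {
        out[i] = scale * (static_cast<int32_t>(q[i]) - offset);
    }
    return out;
}

// Per-channel dequantization (e.g. QSYMM8_PER_CHANNEL): one scale per channel, no offset.
// Channels are assumed to be the innermost dimension purely for illustration.
std::vector<float> dequantize_per_channel(const std::vector<int8_t> &q, const std::vector<float> &scales, size_t channels)
{
    std::vector<float> out(q.size());
    for(size_t i = 0; i < q.size(); ++i)
    {
        out[i] = scales[i % channels] * static_cast<int32_t>(q[i]);
    }
    return out;
}
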
- */ -#include "src/core/gpu/cl/kernels/ClDirectConv2dKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CL/CLUtils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); - - const DataLayout data_layout = src->data_layout(); - const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), "Weights should have same width and height"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), - "Weights feature map dimension should match the respective src's one"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported for 1x1 convolution."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 || weights->dimension(width_idx) == 9) - && std::get<0>(conv_info.stride()) > 2, - "Strides larger than 2 not supported for 3x3, 5x5, 9x9 convolution."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(data_layout != DataLayout::NHWC && !is_data_type_float(src->data_type()) && act_info.enabled(), - "Activation supported only for floating point and NHWC."); - - if(data_layout == DataLayout::NCHW) - { - if(is_data_type_quantized(src->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9, - "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types"); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5, - "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types"); - } - } - - if(biases != nullptr) - { - if(is_data_type_quantized_asymmetric(src->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, 
DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); - } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3), - "Biases size and number of src feature maps should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, - "Biases should be one dimensional"); - } - - // Checks performed when dst is configured - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - - const auto data_type = src->data_type(); - if(is_data_type_quantized(data_type)) - { - const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); - const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); - const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); - - float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; - int output_multiplier = 0; - int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); - } - return Status{}; -} - -inline bool can_run_optimized_kernel_for_bifrost_nchw(GPUTarget gpu_target, unsigned int conv_stride_x, unsigned int conv_stride_y, unsigned int kernel_size, - DataType data_type, DataLayout data_layout) -{ - return gpu_target_is_in(gpu_target, - GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, - GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, - GPUTarget::G52, GPUTarget::G52LIT) - && (kernel_size <= 5) - && (conv_stride_x == 1) && (conv_stride_y == 1) - && (data_type == DataType::F32) - && (data_layout == DataLayout::NCHW); -} - -inline void setup_num_elems_nchw(unsigned int &num_elems_read_per_iteration_x, unsigned int &num_elems_read_per_iteration_y, - unsigned int &num_elems_written_per_iteration_x, unsigned int &num_elems_written_per_iteration_y, - unsigned int kernel_size, const PadStrideInfo &conv_info, const GPUTarget target, ITensorInfo *src) -{ - const DataType data_type = src->data_type(); - const DataLayout data_layout = src->data_layout(); - unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - - const bool run_optimized_bifrost = can_run_optimized_kernel_for_bifrost_nchw(target, conv_stride_x, conv_stride_y, kernel_size, data_type, data_layout); - - if(run_optimized_bifrost) - { - // Configure kernel window - switch(kernel_size) - { - case 1: - { - num_elems_read_per_iteration_x = 4; - num_elems_read_per_iteration_y = 4; - num_elems_written_per_iteration_x = 4; - num_elems_written_per_iteration_y = 4; - break; - } - case 3: - { - num_elems_read_per_iteration_x = 6; - num_elems_read_per_iteration_y = 5; - num_elems_written_per_iteration_x = 4; - num_elems_written_per_iteration_y = 3; - break; - } - case 5: - { - num_elems_read_per_iteration_x = 8; - num_elems_read_per_iteration_y = 6; - num_elems_written_per_iteration_x = 4; - num_elems_written_per_iteration_y = 2; - break; - } - default: - { - ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost"); - } - } - } - else - { - num_elems_read_per_iteration_y = kernel_size; - num_elems_written_per_iteration_x = 8; - num_elems_written_per_iteration_y = 1; - switch(kernel_size) - { - case 1: - switch(conv_stride_x) - { - case 1: - num_elems_read_per_iteration_x = 8; - break; - case 2: - 
num_elems_read_per_iteration_x = 16; - break; - case 3: - switch(src->element_size()) - { - case 1: - num_elems_read_per_iteration_x = 28; - break; - case 2: - num_elems_read_per_iteration_x = 24; - break; - case 4: - num_elems_read_per_iteration_x = 22; - break; - default: - ARM_COMPUTE_ERROR("Invalid data size"); - } - break; - default: - ARM_COMPUTE_ERROR("Invalid convolution stride X"); - } - break; - case 3: - switch(conv_stride_x) - { - case 1: - num_elems_read_per_iteration_x = 10; - break; - case 2: - num_elems_read_per_iteration_x = 17; - break; - default: - ARM_COMPUTE_ERROR("Invalid convolution stride X"); - } - break; - case 5: - switch(conv_stride_x) - { - case 1: - num_elems_read_per_iteration_x = 12; - break; - case 2: - num_elems_read_per_iteration_x = 20; - break; - default: - ARM_COMPUTE_ERROR("Invalid convolution stride X"); - } - break; - case 9: - switch(conv_stride_x) - { - case 1: - num_elems_read_per_iteration_x = 16; - break; - case 2: - num_elems_read_per_iteration_x = 24; - break; - default: - ARM_COMPUTE_ERROR("Invalid convolution stride X"); - } - break; - default: - ARM_COMPUTE_ERROR("Invalid direct convolution size"); - } - } -} - -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info, const GPUTarget target) -{ - const DataLayout data_layout = src->data_layout(); - - // Get dst shape - TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, output_shape, - 1, - src->data_type(), - src->quantization_info()); - - if(data_layout == DataLayout::NHWC) - { - const unsigned int vec_size = std::min(static_cast(dst->tensor_shape()[0]), 4u); - unsigned int num_rows = 1U; - if(dst->tensor_shape()[0] > 16) - { - num_rows = src->data_type() == DataType::F32 ? 
2U : 4U; - } - - // Create window and update padding - Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows)); - return std::make_pair(Status{}, win); - } - else if(data_layout == DataLayout::NCHW) - { - const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int kernel_size = weights->dimension(width_idx); - - unsigned int num_elems_read_per_iteration_x = 0; - unsigned int num_elems_read_per_iteration_y = 0; - unsigned int num_elems_written_per_iteration_x = 0; - unsigned int num_elems_written_per_iteration_y = 0; - - unsigned int conv_pad_left = conv_info.pad_left(); - unsigned int conv_pad_top = conv_info.pad_top(); - unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - - setup_num_elems_nchw(num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, - num_elems_written_per_iteration_x, num_elems_written_per_iteration_y, - kernel_size, conv_info, target, src); - - // Create window and update padding - bool window_changed = false; - Window win = calculate_max_window(*dst, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y)); - - AccessWindowRectangle input_access(src, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, conv_stride_x, conv_stride_y); - AccessWindowStatic weights_access(weights, 0, 0, kernel_size, kernel_size); - AccessWindowRectangle output_access(dst, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y); - window_changed = update_window_and_padding(win, input_access, weights_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); - } - else - { - ARM_COMPUTE_ERROR("Not supported"); - } -} - -bool export_to_cl_image_support(ITensorInfo *tensor, GPUTarget gpu_target, DataLayout data_layout) -{ - if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC)) - { - return false; - } - - // If not floating point - if(!is_data_type_float(tensor->data_type())) - { - return false; - } - - if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD) - { - return false; - } - - // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform - if(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) - { - return false; - } - - // Check cl image pitch alignment - if(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0) - { - return false; - } - - const size_t image_w = tensor->tensor_shape()[0] / 4; - const size_t image_h = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3]; - const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo(); - const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo(); - - if(image_w > max_image_w || image_h > max_image_h) - { - return false; - } - - return true; -} - -} // namespace - -BorderSize ClDirectConv2dKernel::border_size() const -{ - return _border_size; -} - -ClDirectConv2dKernel::ClDirectConv2dKernel() -{ - _type = CLKernelType::DIRECT; -} - -void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - - // Perform validation - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info, act_info)); - - const int conv_stride_x = std::get<0>(conv_info.stride()); - const int conv_stride_y = std::get<1>(conv_info.stride()); - - _data_layout = src->data_layout(); - _conv_info = conv_info; - - const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - const unsigned int kernel_size = weights->dimension(width_idx); - const DataType data_type = src->data_type(); - - const GPUTarget gpu_target = get_target(); - - // Configure kernel window - auto win_config = validate_and_configure_window(src, weights, dst, conv_info, gpu_target); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - std::stringstream kernel_name; - CLBuildOptions build_options; - - if(_data_layout == DataLayout::NHWC) - { - _border_size = BorderSize(); - - kernel_name << "direct_convolution_nhwc"; - - const unsigned int n0 = win_config.second.x().step(); - const unsigned int m0 = win_config.second.y().step(); - const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 
16u : 8u, src->dimension(channel_idx)); - const unsigned int partial_store_n0 = dst->dimension(channel_idx) % n0; - const unsigned int pad_left = conv_info.pad_left(); - const unsigned int pad_top = conv_info.pad_top(); - const bool export_to_cl_image = export_to_cl_image_support(weights, gpu_target, _data_layout); - - // Update the padding for the weights tensor if we can export to cl_image - if(export_to_cl_image) - { - gemm::update_padding_for_cl_image(weights); - } - - if(biases != nullptr) - { - build_options.add_option(std::string("-DHAS_BIAS")); - build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type()))); - } - - build_options.add_option("-cl-fast-relaxed-math"); - build_options.add_option("-DSRC_TENSOR_TYPE=BUFFER"); - build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(width_idx))); - build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(height_idx))); - build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(channel_idx))); - build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_options.add_option("-DDST_TENSOR_TYPE=BUFFER"); - build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(width_idx))); - build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(height_idx))); - build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(channel_idx))); - build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - build_options.add_option_if_else(export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER"); - build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx))); - build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx))); - build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(weights->data_type())); - build_options.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x)); - build_options.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_stride_y)); - build_options.add_option("-DPAD_LEFT=" + support::cpp11::to_string(pad_left)); - build_options.add_option("-DPAD_TOP=" + support::cpp11::to_string(pad_top)); - build_options.add_option("-DN0=" + support::cpp11::to_string(n0)); - build_options.add_option("-DM0=" + support::cpp11::to_string(m0)); - build_options.add_option("-DK0=" + support::cpp11::to_string(k0)); - build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0)); - build_options.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); - - if(is_data_type_quantized(data_type)) - { - const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); - const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); - const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); - - PixelValue zero_value = PixelValue(0, src->data_type(), src->quantization_info()); - int zero_value_s32; - zero_value.get(zero_value_s32); - - float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; - int output_multiplier = 0; - int output_shift = 0; - quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); - build_options.add_option("-DIS_QUANTIZED"); - build_options.add_option("-DDST_MULTIPLIER=" + 
support::cpp11::to_string(output_multiplier)); - build_options.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift)); - build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(-iqinfo.offset)); - build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(-wqinfo.offset)); - build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset)); - build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32)); - build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32)); - } - else - { - build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(data_type)); - build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(0)); - build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(0)); - build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(0)); - build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(0)); - build_options.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); - build_options.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); - } - } - else - { - _border_size = BorderSize(src->padding()); - - kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size; - - build_options.add_option_if(biases != nullptr, std::string("-DHAS_BIAS")); - - const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost_nchw(gpu_target, conv_stride_x, conv_stride_y, kernel_size, data_type, _data_layout); - - if(run_optimized_for_bifrost) - { - build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx)))); - - kernel_name << "_f32_bifrost"; - } - else - { - build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type))); - build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type))); - build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx)))); - build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x))); - build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type))); - - if(is_data_type_quantized(data_type)) - { - const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); - const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); - const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); - - float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; - int output_multiplier = 0; - int output_shift = 0; - quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); - build_options.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); - build_options.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); - build_options.add_option("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)); - build_options.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iqinfo.offset)); - build_options.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wqinfo.offset)); - build_options.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oqinfo.offset)); - - kernel_name.str("direct_convolution_quantized"); - } - } - } - - _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options()); - - // Set 
config_id for enabling LWS tuning - _config_id = kernel_name.str(); - _config_id += "_"; - _config_id += lower_string(string_from_data_type(data_type)); - _config_id += "_"; - _config_id += support::cpp11::to_string(kernel_size); - _config_id += "_"; - _config_id += support::cpp11::to_string(border_size().left); - _config_id += "_"; - _config_id += support::cpp11::to_string(border_size().top); - _config_id += "_"; - _config_id += support::cpp11::to_string(border_size().right); - _config_id += "_"; - _config_id += support::cpp11::to_string(border_size().bottom); - _config_id += "_"; - _config_id += support::cpp11::to_string(conv_stride_x); - _config_id += "_"; - _config_id += support::cpp11::to_string(conv_stride_y); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(width_idx)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(height_idx)); - _config_id += "_"; - _config_id += lower_string(string_from_data_layout(_data_layout)); -} - -Status ClDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const GPUTarget target) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), weights->clone().get(), dst->clone().get(), conv_info, target).first); - - return Status{}; -} - -void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - // Get initial windows - Window slice = window.first_slice_window_3D(); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto weights = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto biases = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - if(_data_layout == DataLayout::NHWC) - { - cl::Image2D weights_cl_image; - - const size_t dim_y_collapsed = ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2), slice.y().step()); - const bool export_to_cl_image = export_to_cl_image_support(weights->info(), get_target(), _data_layout); - - slice.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, slice.y().step())); - slice.set(Window::DimZ, Window::Dimension(0, dst->info()->dimension(3), 1)); - - if(export_to_cl_image) - { - const size_t image_w = weights->info()->dimension(0) / 4; - const size_t image_h = weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3); - const TensorShape shape2d(image_w, image_h); - const size_t image_row_pitch = weights->info()->strides_in_bytes()[1]; - - // Export cl_buffer to cl_image - weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, weights->info()->data_type(), image_row_pitch); - } - - unsigned int idx = 0; - add_4D_tensor_argument(idx, src, slice); - add_4D_tensor_argument(idx, dst, slice); - if(export_to_cl_image) - { - _kernel.setArg(idx++, weights_cl_image); - } - add_4D_tensor_argument(idx, weights, slice); - if(biases != nullptr) - { - add_1D_tensor_argument(idx, biases, slice); - } - 
enqueue(queue, *this, slice, lws_hint()); - } - else - { - Window win_in = window; - - win_in.adjust(Window::DimX, -_conv_info.pad_left(), true); - win_in.adjust(Window::DimY, -_conv_info.pad_top(), true); - - const int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - - const int conv_stride_x = std::get<0>(_conv_info.stride()); - const int conv_stride_y = std::get<1>(_conv_info.stride()); - - win_in.set_dimension_step(width_idx, window[width_idx].step() * conv_stride_x); - win_in.set_dimension_step(height_idx, window[height_idx].step() * conv_stride_y); - - Window slice_in = win_in.first_slice_window_3D(); - unsigned int idx1 = 2 * num_arguments_per_3D_tensor(); - add_3D_tensor_argument(idx1, weights, slice); - - if(biases != nullptr) - { - Window slice_biases; - slice_biases.use_tensor_dimensions(biases->info()->tensor_shape()); - add_1D_tensor_argument(idx1, biases, slice_biases); - } - - _kernel.setArg(idx1++, static_cast(weights->info()->strides_in_bytes()[3])); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice_in); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in)); - } -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClDirectConv2dKernel.h b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.h deleted file mode 100644 index 4880d4a668..0000000000 --- a/src/core/gpu/cl/kernels/ClDirectConv2dKernel.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H -#define ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the direct convolution kernel. */ -class ClDirectConv2dKernel : public IClKernel -{ -public: - ClDirectConv2dKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDirectConv2dKernel); - /** Set the src, weights, biases and dst tensors info. - * - * @note: Due to set_valid_region(), thus src/weights/biases cannot be const. 
Need to change this once the set_valid_region() is removed. - * - * @note: DirectConvolution only works in the following configurations: - * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 - * 3x3 convolution with stride_x = 1/2, stride_y = 1/2 - * 5x5 convolution with stride_x = 1/2, stride_y = 1/2 - * 9x9 convolution with stride_x = 1/2, stride_y = 1/2 - * - * @param[in] compile_context The compile context to be used. - * @param[in] src The src tensor info to convolve. 3 lower dimensions represent a single src [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * The 3rd dimension must be the same as the src's volume 3rd dimension. - * Data type supported:Same as @p src. - * @param[in] biases Biases tensor info. Biases are 1D tensor with dimension [OFM]. - * Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type - * @param[out] dst Output tensor info. - * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p src. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] act_info Contains activaton information described in @ref ActivationLayerInfo. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to ClDirectConv2dKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const GPUTarget target); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -public: - DataLayout _data_layout{}; - BorderSize _border_size{}; - PadStrideInfo _conv_info{}; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClElementwiseKernel.cpp b/src/core/gpu/cl/kernels/ClElementwiseKernel.cpp deleted file mode 100644 index 3d9f0b6fcf..0000000000 --- a/src/core/gpu/cl/kernels/ClElementwiseKernel.cpp +++ /dev/null @@ -1,525 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
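Both validate_arguments() and configure() in the deleted ClDirectConv2dKernel.cpp fold the three quantization scales into a single real factor, multiplier = input_scale * weights_scale / output_scale, and hand it to quantization::calculate_quantized_multiplier() so the kernel can requantize its int32 accumulator with an integer multiply and shift instead of a float multiply. A rough sketch of what such a decomposition produces, under the convention M ≈ (quant_mult / 2^31) * 2^-shift; the library's exact rounding and corner-case handling may differ, and the helper name is illustrative:

#include <cmath>
#include <cstdint>

// Split a positive real multiplier M into a 31-bit fixed-point multiplier and a shift,
// so that M ≈ (quant_mult / 2^31) * 2^-shift with quant_mult in [2^30, 2^31).
void decompose_multiplier(float multiplier, int32_t &quant_mult, int32_t &shift)
{
    int exponent = 0;
    const float mantissa = std::frexp(multiplier, &exponent); // multiplier = mantissa * 2^exponent, mantissa in [0.5, 1)
    shift = -exponent;
    int64_t q = std::llround(static_cast<double>(mantissa) * 2147483648.0); // mantissa * 2^31
    if(q == (1ll << 31)) // mantissa rounded up to 1.0: renormalise
    {
        q /= 2;
        --shift;
    }
    quant_mult = static_cast<int32_t>(q);
}

A positive shift then corresponds to a right shift applied after the fixed-point multiplication, a negative one to a left shift.
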
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/ClElementwiseKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "src/common/utils/Validate.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/Cast.h"
-#include "support/StringSupport.h"
-#include <map>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace
-{
-constexpr unsigned int vector_size_byte_opencl = 16;
-
-std::map<ArithmeticOperation, std::string> supported_arithmetic_ops =
-{
-    { ArithmeticOperation::ADD, "ADD" },
-    { ArithmeticOperation::SUB, "SUB" },
-    { ArithmeticOperation::DIV, "DIV" },
-    { ArithmeticOperation::SQUARED_DIFF, "SQUARED_DIFF" },
-    { ArithmeticOperation::MIN, "MIN" },
-    { ArithmeticOperation::MAX, "MAX" },
-    { ArithmeticOperation::POWER, "POWER" },
-    { ArithmeticOperation::PRELU, "PRELU" },
-};
-
-std::map<ArithmeticOperation, std::string> supported_sat_arithmetic_ops =
-{
-    { ArithmeticOperation::ADD, "ADD" },
-    { ArithmeticOperation::SUB, "SUB" },
-};
-
-std::string generate_id_for_tuning_common(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst)
-{
-    std::string config_id;
-    // Set config_id for enabling LWS tuning
-    config_id = kernel_name;
-    config_id += "_";
-    config_id += lower_string(string_from_data_type(src1.data_type()));
-    config_id += "_";
-    config_id += support::cpp11::to_string(dst.dimension(0));
-    config_id += "_";
-    config_id += support::cpp11::to_string(dst.dimension(1));
-    return config_id;
-}
-
-Status validate_in_place_output_shape(const bool in_place, const bool src1_in_place, const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst, const TensorShape &out_shape)
-{
-    if(in_place)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src1_in_place ?
src1.tensor_shape() : src2.tensor_shape(), 0), - "Wrong shape for dst, cannot do in_place calculation"); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), - "Wrong shape for dst"); - } - return Status{}; -} - -Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(&src1, &src2, &dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &src2); - - // Check whether it is in_place calculation - const bool in_place = (&src1 == &dst) || (&src2 == &dst); - const bool src1_in_place = in_place && (&src1 == &dst); - - const TensorShape out_shape = TensorShape::broadcast_shape(src1.tensor_shape(), src2.tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - - // Validate in case of configured dst - if(dst.total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape)); - } - - return Status{}; -} - -Status validate_arguments_divide_operation(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::F16, DataType::F32, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); - - // Check whether it is in_place calculation - const bool in_place = (src1 == dst) || (src2 == dst); - const bool src1_in_place = in_place && (src1 == dst); - - const TensorShape out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - - // Validate in case of configured dst - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, *src1, *src2, *dst, out_shape)); - } - - return Status{}; -} - -Status validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, - DataType::S32, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &src2); - - if(is_data_type_quantized_symmetric(src1.data_type())) - { - const int32_t in1_offset = src1.quantization_info().uniform().offset; - const int32_t in2_offset = src2.quantization_info().uniform().offset; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_offset != 0, "For quantized symmetric, offset must be zero"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(in2_offset != 0, "For quantized symmetric, offset must be zero"); - } - - // Check whether it is in_place calculation - const bool 
in_place = (&src1 == &dst) || (&src2 == &dst); - const bool src1_in_place = in_place && (&src1 == &dst); - - const TensorShape out_shape = TensorShape::broadcast_shape(src1.tensor_shape(), src2.tensor_shape()); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - - // Validate in case of configured dst - if(dst.total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), "Wrong shape for dst"); - ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape)); - - if(is_data_type_quantized_symmetric(dst.data_type())) - { - const int32_t offset = dst.quantization_info().uniform().offset; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(offset != 0, "For quantized symmetric, offset must be zero"); - } - } - return Status{}; -} - -CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst, const std::string &operation_string) -{ - CLBuildOptions build_opts; - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0)); - - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1.data_type())); - build_opts.add_option("-DVEC_SIZE_IN1=" + support::cpp11::to_string(src1.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_IN2=" + support::cpp11::to_string(src2.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(dst.dimension(0) % num_elems_processed_per_iteration)); - build_opts.add_option("-DOP=" + operation_string); - if(is_data_type_quantized(src1.data_type())) - { - const UniformQuantizationInfo iq1info = src1.quantization_info().uniform(); - const UniformQuantizationInfo iq2info = src2.quantization_info().uniform(); - const UniformQuantizationInfo oqinfo = dst.quantization_info().uniform(); - - build_opts.add_option("-DOFFSET_IN1=" + support::cpp11::to_string(iq1info.offset)); - build_opts.add_option("-DOFFSET_IN2=" + support::cpp11::to_string(iq2info.offset)); - build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(oqinfo.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1info.scale)); - build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2info.scale)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oqinfo.scale)); - } - build_opts.add_option_if(src1.data_type() == DataType::S32, "-DS32"); - - // Check whether it is in_place calculation - const bool in_place = (&src1 == &dst) || (&src2 == &dst); - const bool src1_in_place = in_place && (&src1 == &dst); - build_opts.add_option_if(in_place, "-DIN_PLACE"); - build_opts.add_option_if(src1_in_place, "-DSRC1_IN_PLACE"); - - return build_opts; -} - -std::pair configure_window_arithmetic_common(ITensorInfo &dst) -{ - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0)); - Window win = calculate_max_window(dst, Steps(num_elems_processed_per_iteration)); - return std::make_pair(Status{}, win); -} - -std::pair 
validate_and_configure_window_for_arithmetic_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) -{ - const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2); - const TensorShape &out_shape = broadcast_pair.first; - - auto_init_if_empty(dst, out_shape, 1, src1.data_type()); - - return configure_window_arithmetic_common(dst); -} - -std::pair validate_and_configure_window_for_logical_binary_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) -{ - const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2); - const TensorShape &out_shape = broadcast_pair.first; - - set_shape_if_empty(dst, out_shape); - set_data_type_if_unknown(dst, DataType::U8); - - return configure_window_arithmetic_common(dst); -} - -std::pair validate_and_configure_window_for_division(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) -{ - const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2); - const TensorShape &out_shape = broadcast_pair.first; - - auto_init_if_empty(dst, out_shape, 1, src1.data_type()); - - return configure_window_arithmetic_common(dst); -} -} // namespace - -ClElementwiseKernel::ClElementwiseKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClElementwiseKernel::configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) -{ - // Configure kernel window - auto win_config = validate_and_configure_window(*src1, *src2, *dst); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - - std::string kernel_name = "elementwise_operation_" + name(); - if(is_data_type_quantized(src1->data_type())) - { - kernel_name += "_quantized"; - } - - // Set kernel build options - CLBuildOptions build_opts = generate_build_options(*src1, *src2, *dst); - if(_act_info.enabled()) - { - build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(_act_info.activation()))); - build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(_act_info.a())); - build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(_act_info.b())); - } - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - ICLKernel::configure_internal(win_config.second); - - _config_id = generate_id_for_tuning(kernel_name, *src1, *dst); -} - -void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src_0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src_1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst); - - const TensorShape &in_shape1 = src_0->info()->tensor_shape(); - const TensorShape &in_shape2 = src_1->info()->tensor_shape(); - const TensorShape &out_shape = dst->info()->tensor_shape(); - - bool can_collapse = true; - const bool is_vector = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1; - if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector) - { - can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for(size_t d = Window::DimZ; can_collapse && (d < 
out_shape.num_dimensions()); d++) - { - can_collapse = (in_shape1[d] == in_shape2[d]); - } - } - - bool has_collapsed = false; - Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; - - const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; - const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; - - Window slice = collapsed.first_slice_window_3D(); - Window slice_src1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); - Window slice_src2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); - - // Check whether it is in_place calculation - const bool in_place = (src_0 == dst) || (src_1 == dst); - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src_0, slice_src1); - add_3D_tensor_argument(idx, src_1, slice_src2); - if(!in_place) - { - add_3D_tensor_argument(idx, dst, slice); - } - - enqueue(queue, *this, slice, lws_hint()); - ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src1)); - ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src2)); - } - while(collapsed.slide_window_slice_3D(slice)); -} - -/** Logical binary */ - -void ClLogicalBinaryKernel::configure(const ClCompileContext &compile_context, LogicalOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_ERROR_THROW_ON(ClLogicalBinaryKernel::validate(op, src1, src2, dst)); - _op = op; - configure_common(compile_context, src1, src2, dst); -} - -Status ClLogicalBinaryKernel::validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) -{ - ARM_COMPUTE_UNUSED(op); - ARM_COMPUTE_ASSERT(op != LogicalOperation::Unknown && op != LogicalOperation::Not); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst); - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); - - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*src1, *src2, *dst)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_logical_binary_operators(*src1->clone(), *src2->clone(), *dst->clone()).first); - - return Status{}; -} - -std::string ClLogicalBinaryKernel::name() -{ - switch(_op) - { - case LogicalOperation::And: - return "AND"; - case LogicalOperation::Or: - return "OR"; - case LogicalOperation::Not: - /* fall through */ - default: - ARM_COMPUTE_ASSERT(true); - } - return ""; -} - -std::pair ClLogicalBinaryKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) -{ - return validate_and_configure_window_for_logical_binary_operators(src1, src2, dst); -} - -CLBuildOptions ClLogicalBinaryKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) -{ - // The arithmetic utility functions can be share - return generate_build_options_with_arithmetic_rules(src1, src2, dst, name()); -} - -std::string ClLogicalBinaryKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) -{ - return generate_id_for_tuning_common(kernel_name, src1, dst); -} - -/** Arithmetic operations with saturation*/ -void ClSaturatedArithmeticKernel::configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, - const ConvertPolicy 
&policy, - const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_ERROR_THROW_ON(ClSaturatedArithmeticKernel::validate(op, input1, input2, output, policy, act_info)); - auto padding_info = get_padding_info({ input1, input2, output }); - - _policy = policy; - _op = op; - _act_info = act_info; - configure_common(compile_context, input1, input2, output); - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClSaturatedArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy, - const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_UNUSED(op, policy); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*input1, *input2, *output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone()).first); - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(output->data_type())); - - return Status{}; -} - -std::pair ClSaturatedArithmeticKernel::validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) -{ - return validate_and_configure_window_for_arithmetic_operators(input1, input2, output); -} - -CLBuildOptions ClSaturatedArithmeticKernel::generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) -{ - const bool has_float_out = is_data_type_float(output.data_type()); - auto build_options = generate_build_options_with_arithmetic_rules(input1, input2, output, name()); - build_options.add_option((_policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE"); - return build_options; -} - -std::string ClSaturatedArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) -{ - auto config_id = generate_id_for_tuning_common(kernel_name, input1, output); - config_id += (_policy == ConvertPolicy::WRAP) ? 
"_wrap_" : "_saturate_"; - config_id += lower_string(string_from_data_layout(input1.data_layout())); - return config_id; -} - -std::string ClSaturatedArithmeticKernel::name() -{ - return supported_sat_arithmetic_ops[_op]; -} - -/** Arithmetic operations*/ -void ClArithmeticKernel::configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, - const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_ERROR_THROW_ON(ClArithmeticKernel::validate(op, src1, src2, dst, act_info)); - auto padding_info = get_padding_info({ src1, src2, dst }); - - _op = op; - _act_info = act_info; - configure_common(compile_context, src1, src2, dst); - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst); - if(op == ArithmeticOperation::DIV) - { - // Partial integer support S32/F32/F16 - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_divide_operation(src1, src2, dst)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first); - } - else if(op == ArithmeticOperation::POWER) - { - // Power operators doesn't support integer arithmetic - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_float_only_supported_rules(*src1, *src2, *dst)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*src1, *src2, *dst)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*src1->clone(), *src2->clone(), *dst->clone()).first); - } - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type())); - - return Status{}; -} -std::pair ClArithmeticKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) -{ - if(_op == ArithmeticOperation::DIV || _op == ArithmeticOperation::POWER) - { - // Division and Power operators don't support integer arithmetic - return validate_and_configure_window_for_division(src1, src2, dst); - } - else - { - return validate_and_configure_window_for_arithmetic_operators(src1, src2, dst); - } -} - -CLBuildOptions ClArithmeticKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) -{ - return generate_build_options_with_arithmetic_rules(src1, src2, dst, name()); -} -std::string ClArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) -{ - return generate_id_for_tuning_common(kernel_name, src1, dst); -} - -std::string ClArithmeticKernel::name() -{ - return supported_arithmetic_ops[_op]; -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClElementwiseKernel.h b/src/core/gpu/cl/kernels/ClElementwiseKernel.h deleted file mode 100644 index 4525cec55b..0000000000 --- a/src/core/gpu/cl/kernels/ClElementwiseKernel.h +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_ELEMENTWISE_KERNEL_H -#define ARM_COMPUTE_CL_ELEMENTWISE_KERNEL_H - -#include "src/core/KernelTypes.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for an element-wise operation kernel - * - * Element-wise operation is computed by: - * @f[ dst(x,y) = OP(src1(x,y), src2(x,y))@f] - * - * For binary elementwise ops in-place cannot be enabled by passing nullptr to dst, it can only be enabled by passing either src1 or src2 to dst instead. - * - */ -class ClElementwiseKernel : public IClKernel -{ -public: - ClElementwiseKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClElementwiseKernel); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; - -protected: - /** The name of the operation */ - virtual std::string name() = 0; - - /** Configure kernel for a given list of arguments - * - * @param[in] src1 First source tensor info. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32. - * @param[in] src2 Second source tensor info. Data types supported: same as @p src1. - * @param[in] dst Destination tensor info. Data types supported: same as @p src1. 
- * - * @return a pair of Status and Window - */ - virtual std::pair validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) = 0; - - /** Generate the build options for the specific kernel - * - * @reutrn a CLBuildOptions struct - */ - virtual CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) = 0; - - /** Generate the identifier for tuning - * - * @reutrn a string - */ - virtual std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) = 0; - - /** Commmon configure function for element-wise operators with no additional options (e.g., Div, Min, Max, SquaredDiff) - * - */ - void configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst); - - ActivationLayerInfo _act_info{}; -}; - -class ClLogicalBinaryKernel : public ClElementwiseKernel -{ -public: - ClLogicalBinaryKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClLogicalBinaryKernel); - /** Function to configure kernel - * - * @param[in] compile_context The compile context to be used. - * @param[in] op Logical binary operation to be executed. - * @param[in] src1 First source tensor info. Data types supported: U8. - * @param[in] src2 Second source tensor info. Data types supported: same as @p src1. - * @param[in] dst Destination tensor info. Data types supported: same as @p src1. - */ - void configure(const ClCompileContext &compile_context, LogicalOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClLogicalBinaryKernel::configure() - * - * @return a status - */ - static Status validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst); - -private: - // Inherited methods overridden: - std::string name() override; - std::pair validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override; - CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override; - std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override; - - LogicalOperation _op{ LogicalOperation::Unknown }; -}; - -/** Addition operation */ -class ClSaturatedArithmeticKernel : public ClElementwiseKernel -{ -public: - ClSaturatedArithmeticKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClSaturatedArithmeticKernel); - /** Static function to check if given info will lead to a valid configuration of @ref ClSaturatedArithmeticKernel - * - * @param[in] compile_context The compile context to be used. - * @param[in] op Arithmetic operation to be executed. - * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. - * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[in] output Output tensor info. Data types supported: Same as @p input1. - * @param[in] policy Policy to use to handle overflow. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
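 *
 * A minimal configure/validate sketch (illustrative only; compile_ctx and the three ITensorInfo
 * objects are hypothetical names assumed to be created and initialised elsewhere):
 * @code
 * ClSaturatedArithmeticKernel add_kernel;
 * ARM_COMPUTE_ERROR_THROW_ON(ClSaturatedArithmeticKernel::validate(
 *     ArithmeticOperation::ADD, &src0_info, &src1_info, &dst_info, ConvertPolicy::SATURATE));
 * add_kernel.configure(compile_ctx, ArithmeticOperation::ADD, &src0_info, &src1_info, &dst_info,
 *                      ConvertPolicy::SATURATE);
 * @endcode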
- */ - void configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ConvertPolicy &policy, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClSaturatedArithmeticKernel::configure() - * - * @return a status - */ - static Status validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - -protected: - // Inherited methods overridden: - std::string name() override; - std::pair validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override; - CLBuildOptions generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override; - std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) override; - -private: - ConvertPolicy _policy{}; - ArithmeticOperation _op{}; -}; - -class ClArithmeticKernel : public ClElementwiseKernel -{ -public: - ClArithmeticKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClArithmeticKernel); - - /** Static function to check if given info will lead to a valid configuration of @ref ClArithmeticKernel - * - * @param[in] compile_context The compile context to be used. - * @param[in] op Arithmetic operation to be executed. - * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. - * @param[in] src2 Second source tensor info. Data types supported: same as @p src1. - * @param[in] dst Destination tensor info. Data types supported: same as @p src1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - */ - void configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClArithmeticKernel::configure() - * - * @return a status - */ - static Status validate(ArithmeticOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - -protected: - // Inherited methods overridden: - std::string name() override; - std::pair validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override; - CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override; - std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override; - -private: - ArithmeticOperation _op{}; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_ELEMENTWISE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp b/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp deleted file mode 100644 index 1525c0fe54..0000000000 --- a/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo &src, const ITensorInfo &dst, const ElementWiseUnary op) -{ - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src); - if(op == ElementWiseUnary::LOGICAL_NOT) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::U8); - } - else if(op == ElementWiseUnary::NEG) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32); - } - - // Validate in case of configured dst - if(dst.total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst); - } - - return Status{}; -} -} // namespace - -ClElementWiseUnaryKernel::ClElementWiseUnaryKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const ElementWiseUnary &op) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - auto padding_info = get_padding_info({ src, dst }); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src, *dst, op)); - - const std::string kernel_name = "elementwise_unary"; - const int vec_size_x = 16 / dst->element_size(); - const int dst_width_x = dst->tensor_shape().x(); - const bool multi_access_x = (dst_width_x / vec_size_x > 0); - - // Set kernel build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); - build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(dst_width_x - vec_size_x, 0))); - switch(op) - { - case ElementWiseUnary::RSQRT: - build_opts.add_option("-DOPERATION=rsqrt_op"); - break; - case ElementWiseUnary::EXP: - 
build_opts.add_option("-DOPERATION=exp_op"); - break; - case ElementWiseUnary::NEG: - build_opts.add_option("-DOPERATION=neg_op"); - break; - case ElementWiseUnary::SIN: - build_opts.add_option("-DOPERATION=sin_op"); - break; - case ElementWiseUnary::ABS: - build_opts.add_option("-DOPERATION=fabs_op"); - break; - case ElementWiseUnary::LOG: - build_opts.add_option("-DOPERATION=natural_log_op"); - break; - case ElementWiseUnary::ROUND: - build_opts.add_option("-DOPERATION=round_op"); - break; - case ElementWiseUnary::LOGICAL_NOT: - build_opts.add_option("-DOPERATION=logical_not_op"); - break; - default: - ARM_COMPUTE_ERROR("Not implemented"); - } - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*dst); - if(multi_access_x) - { - win.set(Window::DimX, - Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); - } - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClElementWiseUnaryKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ElementWiseUnary &op) -{ - ARM_COMPUTE_UNUSED(op); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src, *dst, op)); - - return Status{}; -} - -void ClElementWiseUnaryKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h b/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h deleted file mode 100644 index 64cc2f7afc..0000000000 --- a/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_ELEMENTWISE_UNARY_KERNEL_H -#define ARM_COMPUTE_CL_ELEMENTWISE_UNARY_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the elementwise unary operator */ -class ClElementWiseUnaryKernel : public IClKernel -{ -public: - ClElementWiseUnaryKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClElementWiseUnaryKernel); - /** Initialise the kernel's srcs, dst. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src First source tensor info. Data types supported: F16/F32. - * @param[out] dst Destination tensor info. Data types supported: same as @p src. - * @param[in] op Element wise unary operation to perform. - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const ElementWiseUnary &op); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClElementWiseUnaryKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ElementWiseUnary &op); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_ELEMENTWISE_UNARY_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClFillKernel.cpp b/src/core/gpu/cl/kernels/ClFillKernel.cpp deleted file mode 100644 index f213bf8e6a..0000000000 --- a/src/core/gpu/cl/kernels/ClFillKernel.cpp +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClFillKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -ClFillKernel::ClFillKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClFillKernel::configure(const CLCompileContext &compile_context, ITensorInfo *tensor, - const PixelValue &constant_value, - Window *window) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); - ARM_COMPUTE_ERROR_THROW_ON(validate(tensor, constant_value, window)); - - const DataType data_type = tensor->data_type(); - const int vec_size_x = 16 / tensor->element_size(); - - // Create and update the window (if needed) - _full_window = calculate_max_window(*tensor); - Window win = _full_window; - if(window != nullptr) - { - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *window); - win = *window; - } - - const int output_width_x = win.num_iterations(0); - const bool multi_access_x = output_width_x >= vec_size_x; - const bool remainder_x = output_width_x % vec_size_x > 0; - - if(multi_access_x) - { - win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); - } - ICLKernel::configure_internal(win); - - // Create kernel - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); - build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(constant_value, data_type)); - build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); - build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(output_width_x - vec_size_x, 0))); - _kernel = create_kernel(compile_context, "memset", build_opts.options()); -} - -Status ClFillKernel::validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window) -{ - ARM_COMPUTE_UNUSED(tensor); - ARM_COMPUTE_UNUSED(constant_value); - if(window != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON(window->x().step() != 1); - } - return Status{}; -} - -void ClFillKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto tensor = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - - // Collapse all the batches on the third - Window collapsed = window.collapse_if_possible(_full_window, Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, tensor, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClFillKernel.h b/src/core/gpu/cl/kernels/ClFillKernel.h deleted file mode 100644 index ecc2546e4a..0000000000 --- a/src/core/gpu/cl/kernels/ClFillKernel.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_FILL_KERNEL_H -#define ARM_COMPUTE_CL_FILL_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for filling the planes of a tensor */ -class ClFillKernel : public IClKernel -{ -public: - ClFillKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClFillKernel); - /** Initialise the kernel's tensor and filling value - * - * @param[in] compile_context The compile context to be used. - * @param[in,out] tensor Input tensor info. Supported data types: All. - * @param[in] constant_value The value used to fill the planes of the tensor - * @param[in] window Window to be used in case setting only part of a tensor. Default is nullptr. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClFillKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - Window _full_window{}; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_FILL_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClFloorKernel.cpp b/src/core/gpu/cl/kernels/ClFloorKernel.cpp deleted file mode 100644 index 2047128963..0000000000 --- a/src/core/gpu/cl/kernels/ClFloorKernel.cpp +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClFloorKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); - - // Validate in case of configured output - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - - return Status{}; -} -} // namespace - -ClFloorKernel::ClFloorKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClFloorKernel::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Auto initialize output - auto_init_if_empty(*dst, src->tensor_shape(), 1, src->data_type()); - - // Validate - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - auto padding_info = get_padding_info({ src, dst }); - - const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0)); - const int vec_size_x_leftovers = src->dimension(0) % vec_size_x; - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftovers)); - - // Create kernel - _kernel = create_kernel(compile_context, "floor_layer", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps(vec_size_x)); - IClKernel::configure_internal(win); - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status 
ClFloorKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); - return Status{}; -} - -void ClFloorKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IClKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClFloorKernel.h b/src/core/gpu/cl/kernels/ClFloorKernel.h deleted file mode 100644 index 57c9906f2c..0000000000 --- a/src/core/gpu/cl/kernels/ClFloorKernel.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_FLOOR_KERNEL_H -#define ARM_COMPUTE_CL_FLOOR_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to perform a floor operation */ -class ClFloorKernel : public IClKernel -{ -public: - ClFloorKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClFloorKernel); - /** Configure kernel for a given list of arguments - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data type supported: F16/F32. - * @param[out] dst Destination tensor info. 
Same as @p src - */ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClFloorKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_FLOOR_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp deleted file mode 100644 index ec0a3bf8e0..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp +++ /dev/null @@ -1,335 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - -#include "src/core/AccessWindowStatic.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -using ElementsProcessed = Steps; - -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - if(src0->data_type() == DataType::QASYMM8) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); - } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); - - const int m = gemm_info.m(); - const int n = gemm_info.n(); - const int k = gemm_info.k(); - - ARM_COMPUTE_UNUSED(m); - ARM_COMPUTE_UNUSED(n); - ARM_COMPUTE_UNUSED(k); - - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast(k)); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != static_cast(n)); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != static_cast(k)); - if(gemm_info.reinterpret_input_as_3d()) - { - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast(m)); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != static_cast(m)); - } - - if(dst->total_size() != 0) - { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - } - - return Status{}; -} - -std::pair validate_and_configure_window(const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed) -{ - unsigned int &num_elems_processed_per_iteration_x = 
num_elements_processed[0]; - unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - bool reinterpret_dst_as_3d = (gemm_info.depth_output_gemm3d() != 0); - - Window win{}; - bool window_changed = false; - - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_dst_as_3d to be false. - if(reinterpret_input_as_3d == reinterpret_dst_as_3d) - { - reinterpret_dst_as_3d = false; - } - - // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32)); - - TensorInfo tmp_info(*dst); - - if(reinterpret_dst_as_3d) - { - // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(dst->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - // Configure kernel window - num_elems_processed_per_iteration_x = rhs_info.n0; - num_elems_processed_per_iteration_y = lhs_info.m0; - - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - // RHS matrix still needs padding on the X - AccessWindowStatic src1_access(src1, 0, 0, - ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), - src1->dimension(1)); - - window_changed = update_window_and_padding(win, src1_access); // window used by the execute_window_loop - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast(dst->num_dimensions()), 2u); - collapsed = win.collapse(win, dimension_to_collapse); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, collapsed); -} -} // namespace - -ClGemmLowpMatrixMultiplyNativeKernel::ClGemmLowpMatrixMultiplyNativeKernel() -{ - _type = CLKernelType::GEMM; -} - -void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); - - _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); - _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); - - // We still need padding on the X dimension for the RHS matrix - auto padding_info = get_padding_info({ src0, dst }); - - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_dst_as_3d to be false. 
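// Worked example (hypothetical sizes) for the partial-block handling computed further down in this
// function: with internal_m = 100, gemm_info.n() = 30, lhs_info.m0 = 8 and rhs_info.n0 = 4, the
// kernel is built with PARTIAL_STORE_M0 = 100 % 8 = 4 and PARTIAL_STORE_N0 = 30 % 4 = 2, so the
// ragged bottom/right blocks are stored partially instead of requiring dst padding.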
- if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) - { - _reinterpret_input_as_3d = false; - _reinterpret_output_as_3d = false; - } - - // Check if we need to slide the matrix B - const unsigned int num_dimensions_src0 = src0->num_dimensions(); - _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); - - ElementsProcessed num_elements_processed{}; - - // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, - // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. - // This means that the actual m used by the kernel is given by dst->info()->dimension(1) and not by gemm_info.m - const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : dst->dimension(1); - // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. - const unsigned int partial_store_m0 = internal_m % lhs_info.m0; - const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0; - - // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. - // NOTE: This might have implications on heuristics and performance - const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0); - - // Create build options - CLBuildOptions build_opts; - build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); - build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); - build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); - build_opts.add_option("-DM=" + support::cpp11::to_string(src0->dimension(1))); - build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n())); - build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k())); - build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); - build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); - build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(src0->data_type())); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - std::string kernel_name("gemmlowp_mm_native"); - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : ""; - _config_id += "_"; - _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); - _config_id += (_reinterpret_output_as_3d ? 
"3do_" : ""); - _config_id += support::cpp11::to_string(dst->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(gemm_info.k()); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.m0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.n0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.k0); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClGemmLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) -{ - ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info, - num_elements_processed) - .first); - - return Status{}; -} - -void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - if(src1->info()->num_dimensions() < 3) - { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); - } - - Window slice = window.first_slice_window_3D(); - Window slice_matrix_b = slice; - - slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - if(_reinterpret_input_as_3d) - { - // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; - const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - if(_reinterpret_output_as_3d) - { - // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 
1 : 0); - const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - do - { - Window slice_b = slice; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) - { - slice_b = slice_matrix_b; - } - - unsigned int idx = 0; - add_2D_tensor_argument(idx, src0, slice); - add_2D_tensor_argument(idx, src1, slice_b); - add_2D_tensor_argument(idx, dst, slice); - _kernel.setArg(idx++, static_cast(src0->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[2])); - enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h deleted file mode 100644 index eaa125fbf2..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H -#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to multiply matrices with QASYMM8/QASYMM8_SIGNED data type */ -class ClGemmLowpMatrixMultiplyNativeKernel : public IClKernel -{ -public: - ClGemmLowpMatrixMultiplyNativeKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyNativeKernel); - /** Initialise the kernel's input and dst. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src0 Source tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] src1 Source tensor containing the RHS matrix. 
Data type supported: same as @p src0 - * @param[out] dst Destination tensor to store the result of matrix multiplication. Data type supported: S32 - * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * @param[in] rhs_info RHS matrix information used to retrieve the number of columns to be processed by each thread - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmLowpMatrixMultiplyNativeKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp deleted file mode 100644 index 44fda01ded..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp +++ /dev/null @@ -1,300 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -using namespace misc::shape_calculator; - -namespace -{ -using ElementsProcessed = Steps; - -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose); - ARM_COMPUTE_RETURN_ERROR_ON(!rhs_info.transpose); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); - - const int m = gemm_info.m(); - const int n = gemm_info.n(); - const int k = gemm_info.k(); - - TensorShape tensor_shape0{ src0->tensor_shape() }; - tensor_shape0.set(0, k); - tensor_shape0.set(1, m); - - TensorShape tensor_shape1{ src1->tensor_shape() }; - tensor_shape1.set(0, n); - tensor_shape1.set(1, k); - - const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0); - const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - - const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info)); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - - if(dst->total_size() != 0) - { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - } - - return Status{}; -} - -std::pair validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo 
&rhs_info, const GEMMReshapeInfo &gemm_info, - ElementsProcessed &num_elements_processed) -{ - unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; - unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; - bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); - - // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32)); - - TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) - { - // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(dst->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - // Configure kernel window - num_elems_processed_per_iteration_x = rhs_info.n0; - num_elems_processed_per_iteration_y = lhs_info.m0; - Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast(dst->num_dimensions()), 2u); - collapsed = win.collapse(win, dimension_to_collapse); - - return std::make_pair(Status{}, collapsed); -} -} // namespace - -ClGemmLowpMatrixMultiplyReshapedKernel::ClGemmLowpMatrixMultiplyReshapedKernel() -{ - _type = CLKernelType::GEMM; -} - -void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); - - _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); - _k = gemm_info.k(); - _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); - - // Check if we need to slide the matrix B - const unsigned int num_dimensionssrc0 = src0->num_dimensions(); - _slide_matrix_b = (src1->num_dimensions() >= num_dimensionssrc0); - - auto padding_info = get_padding_info({ src0, src1, dst }); - ElementsProcessed num_elements_processed{}; - - // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. - const unsigned int internal_m = _reinterpret_output_as_3d ? 
gemm_info.m() : dst->dimension(1); - - const unsigned int partial_store_m0 = internal_m % lhs_info.m0; - const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0; - - // Create build options - CLBuildOptions build_opts; - build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); - build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); - build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE"); - build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); - build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); - build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m())); - build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n())); - build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0)); - build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); - build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0)); - build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0)); - build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(src0->data_type())); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - - std::string kernel_name("gemmlowp_mm_reshaped_"); - kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_"; - kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt"; - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : ""; - _config_id += "_"; - _config_id += (_reinterpret_output_as_3d ? 
"3do_" : ""); - _config_id += support::cpp11::to_string(dst->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(gemm_info.k()); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.m0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.n0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.k0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.v0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.h0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.interleave); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.interleave); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClGemmLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) -{ - ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info, - num_elements_processed) - .first); - - return Status{}; -} - -void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - if(src1->info()->num_dimensions() < 3) - { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); - } - - Window slice = window.first_slice_window_3D(); - Window slice_matrix_b = slice; - - slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - if(_reinterpret_output_as_3d) - { - // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 4; - const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - do - { - Window slice_b = slice; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) - { - slice_b = slice_matrix_b; - } - - unsigned int idx = 0; - add_2D_tensor_argument(idx, src0, slice); - add_2D_tensor_argument(idx, src1, slice_b); - add_2D_tensor_argument(idx, dst, slice); - _kernel.setArg(idx++, static_cast(_k)); - _kernel.setArg(idx++, static_cast(src0->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, 
static_cast(dst->info()->strides_in_bytes()[2])); - enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h deleted file mode 100644 index 99cff011d1..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H -#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to multiply matrices when both the input matrices LHS (src0) and RHS (src1) have been reshaped - * - * @note The input matrices @p src0 and @p src1 must be reshaped through: - * - @ref opencl::kernels::ClGemmReshapeLhsMatrixKernel - * - @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel - */ -class ClGemmLowpMatrixMultiplyReshapedKernel : public IClKernel -{ -public: - ClGemmLowpMatrixMultiplyReshapedKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyReshapedKernel); - /** Initialise the kernel's input and dst. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src0 Source tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less or equal than 4. - * @param[in] src1 Source tensor containing the RHS reshaped matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less or equal than 3. - * @param[out] dst Destination tensor to store the result of matrix multiplication. Data type supported: S32 - * @param[in] lhs_info LHS matrix information used for reshaping the src0 tensor. Only the following values are supported: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * lhs_info.transpose: false - * @param[in] rhs_info RHS matrix information used for reshaping the src1 tensor. 
Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * rhs_info.transpose: true - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - * - * @note lhs_info.k0 must be equal to rhs_info.k0 - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmLowpMatrixMultiplyReshapedKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - bool _slide_matrix_b{ true }; - bool _reinterpret_output_as_3d{ false }; - unsigned int _k{ 1 }; - bool _use_dummy_work_items{ false }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp deleted file mode 100644 index 9d626936ff..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp +++ /dev/null @@ -1,544 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - -#include "src/core/AccessWindowStatic.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "support/Cast.h" -#include "support/StringSupport.h" - -#include - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -using namespace misc::shape_calculator; - -namespace -{ -using ElementsProcessed = Steps; - -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - if(src0->data_type() == DataType::QASYMM8) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); - } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); - - const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; - const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; - const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; - - ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)), "Only 2,3,4,8,16 are supported for k0"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); - - const int m = gemm_info.m; - const int n = gemm_info.n; - const int k = gemm_info.k; - - TensorShape tensor_shape1{ src1->tensor_shape() }; - tensor_shape1.set(0, n); - tensor_shape1.set(1, k); - - const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); - - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast(k)); - if(gemm_info.reinterpret_input_as_3d) - { - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast(m)); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != static_cast(m)); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - - const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); - if(dst->total_size() != 0) - { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_dst_shape); - 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); - if(output_stage.type == GEMMLowpOutputStageType::NONE) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); - } - } - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != bias->dimension(0)); - } - - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT), - "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported"); - - // Checks performed if the dst stage needs to be fused - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - // If a_offset == 0, vector_sum_col can be a nullptr - if(gemm_info.a_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_dst_shape[0]); - } - - // If b_offset == 0, vector_sum_row can be a nullptr - if(gemm_info.b_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - - // Check if mm result is a 3D reinterpretation - const bool reinterpret_as_3d = expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x(); - - // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_dst_shape[1] * expected_dst_shape[2])); - ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_dst_shape[1]); - - if(expected_dst_shape.num_dimensions() > 1) - { - const unsigned int dst_batch_idx = reinterpret_as_3d ? 
3 : 2; - - TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); - vector_sum_row_shape.collapse_from(1); - TensorShape collapsed_dst_shape(expected_dst_shape); - collapsed_dst_shape.collapse_from(dst_batch_idx); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_dst_shape[dst_batch_idx], - "vector_sum_row must have the same number of batches of dst tensor"); - - if(gemm_info.a_offset != 0) - { - TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); - vector_sum_col_shape.collapse_from(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); - } - } - } - - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type()); - } - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); - - if(output_multipliers != nullptr && output_shifts != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(output_stage.is_quantized_per_channel) - { - ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_shifts->dimension(0)); - ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_multipliers->dimension(0)); - } - } - } - return Status{}; -} - -std::pair validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, - ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed) -{ - const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; - - unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; - unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; - bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0); - - Window win{}; - Window win_out{}; - bool window_changed = false; - - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. 
- if(reinterpret_input_as_3d == reinterpret_output_as_3d) - { - reinterpret_output_as_3d = false; - } - - // dst tensor auto initialization if not yet initialized - const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); - if(output_stage.type != GEMMLowpOutputStageType::NONE) - { - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type)); - } - else - { - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(DataType::S32)); - } - - TensorInfo tmp_info(*dst); - - if(reinterpret_output_as_3d) - { - // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(dst->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - // Configure kernel window - num_elems_processed_per_iteration_x = gemm_info.rhs_info.n0; - num_elems_processed_per_iteration_y = gemm_info.lhs_info.m0; - - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - if(gemm_info.a_offset != 0) - { - AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x); - window_changed = window_changed || update_window_and_padding(win_out, vector_sum_col_access); - } - // No access window needed for vector_sum_row - ARM_COMPUTE_UNUSED(vector_sum_row); - - if(bias != nullptr) - { - AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x); - window_changed = window_changed || update_window_and_padding(win_out, bias_access); - } - - if(output_multipliers != nullptr && output_stage.is_quantized_per_channel) - { - AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration_x); - AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x); - window_changed = window_changed || update_window_and_padding(win_out, output_multipliers_access, output_shifts_access); - } - } - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast(dst->num_dimensions()), 2u); - collapsed = win.collapse(win, dimension_to_collapse); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, collapsed); -} -} // namespace - -ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel() -{ - _type = CLKernelType::GEMM; -} - -void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, - const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, - ITensorInfo *output_multipliers, ITensorInfo *output_shifts) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); - - auto padding_info = get_padding_info({ src0, src1, dst, vector_sum_row }); - const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; - const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; - const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; - const int32_t a_offset = gemm_info.a_offset; - const int32_t b_offset = gemm_info.b_offset; - - _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; - _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0); - _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); - _is_quantized_per_channel = output_stage.is_quantized_per_channel; - - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) - { - _reinterpret_input_as_3d = false; - _reinterpret_output_as_3d = false; - } - - // Check if we need to slide the matrix B - const unsigned int num_dimensions_src0 = src0->num_dimensions(); - _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); - - ElementsProcessed num_elements_processed{}; - - // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts, num_elements_processed); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, - // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. - // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m - const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1); - - // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. - // NOTE: This might have implications on heuristics and performance - const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0); - - // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. 
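// Editor's note (worked example with hypothetical sizes, not part of the original file):
// with internal_m = 100 and internal_m0 = 8, the last block along M holds 100 % 8 = 4
// valid rows; with gemm_info.n = 30 and rhs_info.n0 = 4, the last block along N holds
// 30 % 4 = 2 valid columns. The kernel stores only those leftover elements instead of
// padding dst, which is what the PARTIAL_STORE_M0/N0 build options below encode.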
- const unsigned int partial_store_m0 = internal_m % internal_m0; - const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0; - - // Create build options - CLBuildOptions build_opts; - build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); - build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); - build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); - build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); - build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m)); - build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n)); - build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k)); - build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); - build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); - build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); - build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(src0->data_type())); - - std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_"); - kernel_name += rhs_info.transpose ? 
"t" : "nt"; - - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - kernel_name += "_fused_output_stage_fixedpoint"; - _fuse_output_stage = true; - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0 && vector_sum_col != nullptr) - { - build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); - build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); - } - // If b_offset == 0, vector_sum_row can be a nullptr - build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); - build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * src0->dimension(0))); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset)); - build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); - build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0])); - build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION"); - - const int min = output_stage.gemmlowp_min_bound; - const int max = output_stage.gemmlowp_max_bound; - - PixelValue min_val{}; - PixelValue max_val{}; - std::tie(min_val, max_val) = get_min_max(dst->data_type()); - build_opts.add_option_if(min != min_val.get(), "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if(max != max_val.get(), "-DMAX_BOUND=" + support::cpp11::to_string(max)); - } - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : ""; - _config_id += "_"; - _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); - _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); - _config_id += support::cpp11::to_string(dst->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(gemm_info.k); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.m0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.n0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.k0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.h0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.interleave); - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) -{ - ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - dst->clone().get(), - gemm_info, - vector_sum_col != nullptr ? 
vector_sum_col->clone().get() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, - bias != nullptr ? bias->clone().get() : nullptr, - output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr, - output_shifts != nullptr ? output_shifts->clone().get() : nullptr, - num_elements_processed) - .first); - - return Status{}; -} - -void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); - const auto vector_sum_col = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); - const auto vector_sum_row = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); - const auto output_shifts = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); - const auto output_multipliers = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - if(src1->info()->num_dimensions() < 3) - { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); - } - - Window slice = window.first_slice_window_3D(); - Window slice_matrix_b = slice; - - slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - if(_reinterpret_input_as_3d) - { - // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; - const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - if(_reinterpret_output_as_3d) - { - // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 
1 : 0); - const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - // Set window for vector_sum_col - Window win_vector_sum_col = slice; - win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - // Set window for vector_sum_row - Window win_vector_sum_row = slice; - win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Window biases_slice = slice; - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - do - { - Window slice_b = slice; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) - { - slice_b = slice_matrix_b; - } - - unsigned int idx = 0; - add_2D_tensor_argument(idx, src0, slice); - add_2D_tensor_argument(idx, src1, slice_b); - add_2D_tensor_argument(idx, dst, slice); - _kernel.setArg(idx++, static_cast(src0->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[2])); - if(_reinterpret_input_as_3d) - { - // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor - idx++; - } - - if(_reinterpret_output_as_3d) - { - // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor - idx++; - } - - if(_fuse_output_stage) - { - add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col); - add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row); - add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice); - add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_multipliers, biases_slice); - add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice); - } - enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h deleted file mode 100644 index 9e52b38249..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H -#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to multiply matrices with QASYMM8 data type when only the input matrix RHS (src1) has been reshaped - * - * @note The input matrix src1 must be reshaped through @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel - * @note For fused output stage, only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT type is supported - */ -class ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel : public IClKernel -{ -public: - ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel); - /** Initialise the kernel's source and destination. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] src1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0 - * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32. - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info. - * Only the following values are supported for LHS info: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * Only the following values are supported for RHS info: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same as lhs_info.k0 - * rhs_info.transpose: true - * @param[in] vector_sum_col (Optional) Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32 - * @param[in] vector_sum_row (Optional) Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32 - * @param[in] bias (Optional) Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: S32. - * @param[in] output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32. - * @param[in] output_shifts (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32. 
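Editorial sketch (not part of the original file): a minimal illustration of how a caller might fill in the GEMMKernelInfo descriptor consumed by this kernel, using only field and enum names that appear elsewhere in this patch; all numeric values are placeholders, not validated configurations.

GEMMKernelInfo gemm_info{};
gemm_info.m = 64;                       // illustrative problem sizes
gemm_info.n = 32;
gemm_info.k = 128;
gemm_info.lhs_info.m0 = 4;              // block sizes within the documented ranges
gemm_info.lhs_info.k0 = 16;
gemm_info.rhs_info.n0 = 4;
gemm_info.rhs_info.k0 = 16;
gemm_info.rhs_info.h0 = 2;
gemm_info.rhs_info.transpose = true;    // this kernel requires a transposed reshaped RHS
gemm_info.a_offset = -10;               // illustrative quantization offsets
gemm_info.b_offset = -5;
gemm_info.output_stage.type             = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
gemm_info.output_stage.output_data_type = DataType::QASYMM8;
gemm_info.output_stage.gemmlowp_offset  = 2;
gemm_info.output_stage.gemmlowp_multipliers = { 1073741824 };
gemm_info.output_stage.gemmlowp_shifts      = { 1 };
gemm_info.output_stage.gemmlowp_min_bound   = 0;
gemm_info.output_stage.gemmlowp_max_bound   = 255;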
- */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, ITensorInfo *bias = nullptr, - ITensorInfo *output_multipliers = nullptr, ITensorInfo *output_shifts = nullptr); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr, - const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _is_quantized_per_channel{ false }; - bool _fuse_output_stage{ false }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */ \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp deleted file mode 100644 index e491cca914..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" - -#include "src/core/helpers/WindowHelpers.h" - -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t a_offset, int32_t b_offset) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0)); - } - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); - } - - // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - - // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); - - // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); - ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); - - TensorShape output_shape = mm_result->tensor_shape(); - if(output_shape.num_dimensions() > 1) - { - const unsigned int output_batch_idx = reinterpret_as_3d ? 
3 : 2; - - TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); - vector_sum_row_shape.collapse_from(1); - output_shape.collapse_from(output_batch_idx); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], - "mm_result tensor must have the same number of batches of output tensor"); - - if(a_offset != 0) - { - TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); - vector_sum_col_shape.collapse_from(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); - } - } - } - - return Status{}; -} -} // namespace - -ClGemmLowpOffsetContributionKernel::ClGemmLowpOffsetContributionKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClGemmLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context, - const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t k, int32_t a_offset, int32_t b_offset) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); - - auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias }); - - // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->num_dimensions() > 1 - && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->dimension(0)); - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); - build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); - } - // If b_offset == 0, vector_sum_row can be a nullptr - build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); - build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); - build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); - build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2))); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - - std::string kernel_name("gemmlowp_offset_contribution"); - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration)); - IClKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name + "_"; - _config_id += support::cpp11::to_string(mm_result->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(mm_result->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(mm_result->dimension(2)); 
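// Editor's note (illustrative, not part of the original file): adjust_vec_size(4, ...)
// above picks the per-iteration vector width (at most 4 here); e.g. an mm_result row
// length of 10 gives VEC_SIZE_LEFTOVER = 10 % 4 = 2, so the kernel processes a
// 2-element tail at the end of each row rather than relying on padded tensors.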
- - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t a_offset, int32_t b_offset) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); - return Status{}; -} - -void ClGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window); - - const auto vector_sum_col = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); - const auto vector_sum_row = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); - const auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); - const auto mm_result = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_SRC_DST)); - - Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - // Set window for vector_sum_col - Window win_vector_sum_col = slice; - win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - // Set window for vector_sum_row - Window win_vector_sum_row = slice; - win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Window biases_slice = slice; - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, mm_result, slice); - add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col); - add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row); - add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice); - - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h deleted file mode 100644 index d1712f4f4b..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_KERNEL_H -#define ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel used to add the offset contribution after the matrix multiplication. The computation is performed in-place - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), - * and adds to it the offset contribution of matrix A and matrix B in-place. - * - * The final result is: - * - * mm_result[i][k] = mm_result[i][k] + - * (vector_sum_col[k] * a_offset) + - * (vector_sum_row[i] * b_offset) + - * (a_offset * b_offset * k) - * - */ -class ClGemmLowpOffsetContributionKernel : public IClKernel -{ -public: - ClGemmLowpOffsetContributionKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpOffsetContributionKernel); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] k Number of matrix A columns or Matrix B rows - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. 
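// Editor's note (worked example with hypothetical values, not part of the original file):
// for one accumulator with k = 2, an LHS row (3, 4), an RHS column (5, 6),
// a_offset = -1 and b_offset = -2, the documented formula gives:
//   mm_result         = 3*5 + 4*6                          = 39
//   vector_sum_col[k] = 5 + 6                               = 11
//   vector_sum_row[i] = 3 + 4                               = 7
//   corrected result  = 39 + 11*(-1) + 7*(-2) + (-1)*(-2)*2 = 18
// which matches (3 - 1)*(5 - 2) + (4 - 1)*(6 - 2) = 18, i.e. the product of the
// offset-adjusted operands that this kernel reconstructs in-place.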
- */ - void configure(const CLCompileContext &compile_context, - const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t k, int32_t a_offset, int32_t b_offset); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmLowpOffsetContributionKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp deleted file mode 100644 index 1e2d7d7efe..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" - -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, - int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0)); - } - - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(output_stage.is_quantized_per_channel) - { - ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_shifts->dimension(0)); - ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_multipliers->dimension(0)); - } - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); - } - - // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - - // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); - - // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); - ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); - - TensorShape output_shape = mm_result->tensor_shape(); - if(output_shape.num_dimensions() > 1) - { - const unsigned int output_batch_idx = reinterpret_as_3d ? 
3 : 2; - - TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); - vector_sum_row_shape.collapse_from(1); - output_shape.collapse_from(output_batch_idx); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], - "mm_result tensor must have the same number of batches of output tensor"); - - if(a_offset != 0) - { - TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); - vector_sum_col_shape.collapse_from(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); - } - } - } - - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE); - // Checks performed when output is configured - if((dst != nullptr) && (dst->total_size() != 0)) - { - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type()); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, dst); - } - - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(), "per channel quantization info is incorrect"); - - return Status{}; -} -} // namespace - -ClGemmLowpOffsetContributionOutputStageKernel::ClGemmLowpOffsetContributionOutputStageKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext &compile_context, - const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, - int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst, output_multipliers, output_shifts); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage, output_multipliers, output_shifts)); - - auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias, dst, output_multipliers, output_shifts }); - - const int min = output_stage.gemmlowp_min_bound; - const int max = output_stage.gemmlowp_max_bound; - - _is_quantized_per_channel = output_stage.is_quantized_per_channel; - - // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->num_dimensions() > 1 - && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); - - // Auto initialize the output - auto_init_if_empty(*dst, mm_result->clone()->set_data_type(output_stage.output_data_type)); - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->dimension(0)); - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); - - // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) - { - build_opts.add_option("-DA_OFFSET=" + 
support::cpp11::to_string(a_offset)); - build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); - } - // If b_offset == 0, vector_sum_row can be a nullptr - build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); - build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); - build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); - build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2))); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset)); - build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); - build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0])); - build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION"); - build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - - PixelValue min_val{}; - PixelValue max_val{}; - std::tie(min_val, max_val) = get_min_max(dst->data_type()); - build_opts.add_option_if((min > min_val.get()), "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < max_val.get()), "-DMAX_BOUND=" + support::cpp11::to_string(max)); - - std::string kernel_name("gemmlowp_offset_contribution"); - kernel_name += "_" + string_from_gemmlowp_output_stage(output_stage.type); - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name + "_"; - _config_id += support::cpp11::to_string(mm_result->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(mm_result->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(mm_result->dimension(2)); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *dst, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage, output_multipliers, output_shifts)); - return Status{}; -} - -void ClGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto mm_result = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - const auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); - const auto vector_sum_col = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); - const auto vector_sum_row = 
utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); - const auto output_shifts = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); - const auto output_multipliers = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - // Set window for vector_sum_col - Window win_vector_sum_col = slice; - win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - // Set window for vector_sum_row - Window win_vector_sum_row = slice; - win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Window biases_slice = slice; - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, mm_result, slice); - add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col); - add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row); - add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice); - add_3D_tensor_argument(idx, dst, slice); - add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_multipliers, biases_slice); - add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h deleted file mode 100644 index 977f2eac53..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_OUTPUT_STAGE_KERNEL_H -#define ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_OUTPUT_STAGE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel used to add the offset contribution after the matrix multiplication and perform the output stage. - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), adds to it the offset contribution - * of matrix A and matrix B and performs the output stage defined by the output_stage argument - * - * @note For quantized computations the output data type for auto-initialization must be passed as part of the @ref GEMMLowpOutputStageInfo. - */ -class ClGemmLowpOffsetContributionOutputStageKernel : public IClKernel -{ -public: - ClGemmLowpOffsetContributionOutputStageKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpOffsetContributionOutputStageKernel); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 - * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. - * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result - * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. - * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result. - * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED. - * @param[in] k Number of matrix A columns or Matrix B rows - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] output_stage GEMMLowp output stage info - * @param[in] output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). - * Supported data types: S32 - * @param[in] output_shifts Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). 
- * Supported data types: S32 - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, - int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmLowpOffsetContributionOutputStageKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, int32_t a_offset, - int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - bool _is_quantized_per_channel{ false }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_OUTPUT_STAGE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp deleted file mode 100644 index 8aec1654d9..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" - -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); - - // Check biases if exist - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); - } - - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching dst data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - - return Status{}; -} -} // namespace - -ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -Status ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, - const GEMMLowpOutputStageInfo *info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info)); - - return Status{}; -} - -void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, - const GEMMLowpOutputStageInfo *info) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); - - auto padding_info = get_padding_info({ src, bias, dst }); - - // dst auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type)); - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0)); - - // Set the arguments to pass at compile time - auto min = info->gemmlowp_min_bound; - auto max = info->gemmlowp_max_bound; - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(info->gemmlowp_offset)); - build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(info->gemmlowp_multiplier)); - build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(info->gemmlowp_shift)); - build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), - "-DMIN_BOUND=" + support::cpp11::to_string(min)); 
- build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), - "-DMAX_BOUND=" + support::cpp11::to_string(max)); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - - // Create kernel - const std::string kernel_name = (info->output_data_type == DataType::QSYMM16) ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16" : "gemmlowp_output_stage_quantize_down_fixedpoint"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - auto win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - const auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - // Create src window - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - // Setup bias slice - unsigned int idx1 = num_arguments_per_3D_tensor(); - if(bias != nullptr) - { - Window biases_slice(slice); - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - add_1D_tensor_argument(idx1, bias, biases_slice); - } - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx1, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h deleted file mode 100644 index c935aa7ec4..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FIXEDPOINT_KERNEL_H -#define ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FIXEDPOINT_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED/QSYMM16 - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final quantized value. - * The following computations will be performed by the kernel: - * - * -# Compute fixed point multiplication between each entry of input by gemmlowp_multiplier - * -# Add bias to final result if bias tensor is not a nullptr - * -# Round to nearest division by a power-of-two using result_shift - * -# Add offset to each result - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values to the proper quantized range and cast to QASYMM8/QASYMM8_SIGNED/QSYMM16. - */ -class ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel : public IClKernel -{ -public: - ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel); - /** Initialise the kernel's source and destination. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src. - * @param[out] dst Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16. - * @param[in] info Output stage info. Used to pass the quantized output data type - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FIXEDPOINT_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp deleted file mode 100644 index 9b488ff329..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. 
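[Editor's note: the quantize-down-by-fixed-point steps listed in the ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel comment above amount to the per-element computation sketched below. This is a minimal scalar sketch assuming the QASYMM8 output case and a positive result_shift; the saturation and tie-breaking corner cases handled by the OpenCL kernel are elided, and the function name and parameters are placeholders, not ACL API.]

#include <algorithm>
#include <cstdint>

// Illustrative scalar form of the fixed-point output stage (QASYMM8 case, simplified rounding).
uint8_t quantize_down_fixedpoint_ref(int32_t acc, int32_t bias, int32_t result_fixedpoint_multiplier,
                                     int32_t result_shift, int32_t result_offset_after_shift,
                                     int32_t min_bound, int32_t max_bound)
{
    acc += bias;                                                                     // optional bias
    acc = static_cast<int32_t>((static_cast<int64_t>(acc) * result_fixedpoint_multiplier + (1ll << 30)) >> 31); // fixed-point multiply
    if(result_shift > 0)
    {
        acc = (acc + (1 << (result_shift - 1))) >> result_shift;                     // round-to-nearest divide by 2^result_shift
    }
    acc += result_offset_after_shift;                                                // add the output offset
    acc = std::max(min_bound, std::min(max_bound, acc));                             // user-specified bounds
    return static_cast<uint8_t>(std::max(0, std::min(255, acc)));                    // clamp to the QASYMM8 range and cast
}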
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" - -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) && (info->output_data_type != DataType::QASYMM8_SIGNED)); - ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))); - ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)) - || info->gemmlowp_min_bound > info->gemmlowp_max_bound); - - // Check biases if exist - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); - } - - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching output data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - - return Status{}; -} -} // namespace - -ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -Status ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, - const GEMMLowpOutputStageInfo *info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info)); - - return Status{}; -} - -void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const 
CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, - const GEMMLowpOutputStageInfo *info) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); - - auto padding_info = get_padding_info({ src, bias, dst }); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type)); - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0)); - - auto min = info->gemmlowp_min_bound; - auto max = info->gemmlowp_max_bound; - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(info->gemmlowp_real_multiplier)); - build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(info->gemmlowp_offset)); - build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option_if((min > 0), "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < 255), "-DMAX_BOUND=" + support::cpp11::to_string(max)); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - - // Create kernel - _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down_float", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - const auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - // Create input window - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - // Setup bias slice - unsigned int idx1 = num_arguments_per_3D_tensor(); - if(bias != nullptr) - { - Window biases_slice(slice); - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - add_1D_tensor_argument(idx1, bias, biases_slice); - } - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx1, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h deleted file mode 100644 index eff8c4b2be..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm 
Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FLOAT_KERNEL_H -#define ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FLOAT_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. - * The following computations will be performed by the kernel: - * - * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier - * -# Add bias to final result if bias tensor is not a nullptr - * -# Requantize - * -# Add offset to each result - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values to - * - to the [0..255] range and cast to QASYMM8. - * - to the [-128..127] range and cast to QASYMM8_SIGNED. - */ -class ClGemmLowpQuantizeDownInt32ScaleByFloatKernel : public IClKernel -{ -public: - ClGemmLowpQuantizeDownInt32ScaleByFloatKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpQuantizeDownInt32ScaleByFloatKernel); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src. - * @param[out] dst Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] info Output stage info. 
Used to pass the quantized output data type - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FLOAT_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp deleted file mode 100644 index 9a25973a93..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
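[Editor's note: in contrast to the fixed-point variant, the ClGemmLowpQuantizeDownInt32ScaleByFloatKernel above rescales each accumulator with a single real multiplier. A rough scalar sketch for the QASYMM8 case follows; it is illustrative only, the names are placeholders, and the exact rounding is defined by the OpenCL kernel.]

#include <algorithm>
#include <cmath>
#include <cstdint>

// Illustrative scalar form of the float-scale output stage (QASYMM8 case).
uint8_t quantize_down_float_ref(int32_t acc, int32_t bias, float real_multiplier,
                                int32_t output_offset, int32_t min_bound, int32_t max_bound)
{
    const float scaled = static_cast<float>(acc + bias) * real_multiplier;  // scale by the real multiplier
    int32_t v = static_cast<int32_t>(std::lround(scaled)) + output_offset;  // round and add the output offset
    v = std::max(min_bound, std::min(max_bound, v));                        // user-specified bounds
    return static_cast<uint8_t>(std::max(0, std::min(255, v)));             // clamp to the QASYMM8 range and cast
}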
- */ -#include "src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" - -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) && (output_stage->output_data_type != DataType::QASYMM8_SIGNED)); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) - || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound); - - // Check biases if exist - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); - } - - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != output_stage->output_data_type, "Mismatching output data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - - return Status{}; -} -} //namespace - -ClGemmLowpQuantizeDownInt32ScaleKernel::ClGemmLowpQuantizeDownInt32ScaleKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -Status ClGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage)); - - return Status{}; -} - -void ClGemmLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, - const GEMMLowpOutputStageInfo *output_stage) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, output_stage)); - - auto padding_info = get_padding_info({ src, bias, dst }); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type)); - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0)); - - // Set the arguments to pass at compile time - auto min = output_stage->gemmlowp_min_bound; - auto max = output_stage->gemmlowp_max_bound; - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage->gemmlowp_offset)); - build_opts.add_option("-DRESULT_MULT_INT=" + 
support::cpp11::to_string(output_stage->gemmlowp_multiplier)); - build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage->gemmlowp_shift)); - build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max), - "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max), - "-DMAX_BOUND=" + support::cpp11::to_string(max)); - build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - - // Create kernel - _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -void ClGemmLowpQuantizeDownInt32ScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - const auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - unsigned int idx1 = num_arguments_per_3D_tensor(); - if(bias != nullptr) - { - Window biases_slice(slice); - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - add_1D_tensor_argument(idx1, bias, biases_slice); - } - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx1, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h deleted file mode 100644 index c5374755c8..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H -#define ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. - * The following computations will be performed by the kernel: - * - * -# Add offset terms to final result - * -# Multiply each entry of result by result_mult_int - * -# Add bias to final result if bias tensor is not a nullptr - * -# Shift the int32 accumulator by result_shift - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values: - * -# -to the [0..255] range and cast to QASYMM8. - * -# -to the [-128..127] range and cast to QASYMM8_SIGNED. - */ -class ClGemmLowpQuantizeDownInt32ScaleKernel : public ICLKernel -{ -public: - ClGemmLowpQuantizeDownInt32ScaleKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpQuantizeDownInt32ScaleKernel); - /** Initialise the kernel's source and destination. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src. - * @param[out] dst Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED - * @param[in] output_stage GEMMLowp output stage metadata. - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H */ \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp b/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp deleted file mode 100644 index b4886805fb..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp +++ /dev/null @@ -1,219 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
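[Editor's note: the plain integer-scale stage in the ClGemmLowpQuantizeDownInt32ScaleKernel header above uses an integer multiplier and shift instead of a fixed-point or real multiplier. A minimal scalar sketch for the QASYMM8 case follows; it is illustrative only, and the precise ordering of the bias/offset additions and the rounding behaviour are defined by the OpenCL kernel gemmlowp_output_stage_quantize_down.]

#include <algorithm>
#include <cstdint>

// Illustrative scalar form of the integer-scale output stage (QASYMM8 case, simplified).
uint8_t quantize_down_scale_ref(int32_t acc, int32_t bias, int32_t result_offset, int32_t result_mult_int,
                                int32_t result_shift, int32_t min_bound, int32_t max_bound)
{
    int32_t v = (acc + bias + result_offset) * result_mult_int;  // add the offset terms, then scale by the integer multiplier
    v >>= result_shift;                                          // divide by 2^result_shift
    v = std::max(min_bound, std::min(max_bound, v));             // user-specified bounds
    return static_cast<uint8_t>(std::max(0, std::min(255, v)));  // clamp to the QASYMM8 range and cast
}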
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/KernelDescriptors.h" - -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8); - - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(1), "Output vector must have length equal to the number of rows of the input matrix"); - } - return Status{}; -} - -Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(0), "Output vector must have length equal to the number of columns of the input matrix"); - } - return Status{}; -} -} // namespace - -IClGemmLowpReductionKernel::IClGemmLowpReductionKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_a, ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row)); - - // Output auto initialization if not yet initialized - auto_init_if_empty(*vector_sum_row, TensorShape(mtx_a->dimension(1)), 1, DataType::S32); - - auto padding_info = get_padding_info({ mtx_a, vector_sum_row }); - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; - 
build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(mtx_a->dimension(0))); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_a->data_type())); - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_a->data_type())); - build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar)); - - const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()); - - std::string kernel_name = "gemmlowp_matrix_a_reduction" + std::string(is_dot8_supported ? "_dot8" : ""); - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - // This kernel does not need padding - Window win = calculate_max_window(*vector_sum_row, Steps()); - ICLKernel::configure_internal(win); - - _config_id = kernel_name; - _config_id += "_"; - _config_id += support::cpp11::to_string(mtx_a->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(mtx_a->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(mtx_a->dimension(2)); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row)); - - return Status{}; -} - -void ClGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY); - Window slice_in = collapsed.first_slice_window_2D(); - Window slice_out = collapsed.first_slice_window_2D(); - - // Setup input slice. Its dimensions are increased in the cl kernel. 
- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice_in); - add_2D_tensor_argument(idx, dst, slice_out); - enqueue(queue, *this, slice_out, lws_hint()); - } - while(collapsed.slide_window_slice_2D(slice_out)); -} - -void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_b, ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); - - // Output auto initialization if not yet initialized - auto_init_if_empty(*vector_sum_col, TensorShape(mtx_b->dimension(0)), 1, DataType::S32); - - auto padding_info = get_padding_info({ mtx_b, vector_sum_col }); - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, mtx_b->dimension(0)); - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mtx_b->dimension(0) % num_elems_processed_per_iteration)); - build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->dimension(0))); - build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->dimension(1))); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_b->data_type())); - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_b->data_type())); - build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar)); - - // Create kernel - _kernel = create_kernel(compile_context, "gemmlowp_matrix_b_reduction", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*vector_sum_col, Steps(num_elems_processed_per_iteration)); - IClKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); - - return Status{}; -} - -void ClGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window collapsed = window.collapse_if_possible(IKernel::window(), Window::DimY); - - Window slice_out = collapsed.first_slice_window_2D(); - Window slice_in = slice_out; - - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice_in); - add_2D_tensor_argument(idx, dst, slice_out); - enqueue(queue, *this, slice_out, lws_hint()); - } - while(collapsed.slide_window_slice_2D(slice_out)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git 
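The two kernels deleted above produce the per-row sums of matrix A (vector_sum_row) and the per-column sums of matrix B (vector_sum_col). As the gemmlowp notes referenced in the header file that follows explain, these sums are exactly what is needed to correct a raw integer accumulation for the quantization offsets. A scalar model of that correction, under one common sign convention and purely for illustration (it is not the OpenCL code and the function name is hypothetical):

    #include <cstdint>
    #include <vector>

    // Scalar sketch of the role played by the two reductions above.
    // a: M x K, b: K x N, acc: M x N raw int32 accumulators of a*b, all row-major.
    void apply_offset_correction(const std::vector<uint8_t> &a, const std::vector<uint8_t> &b,
                                 std::vector<int32_t> &acc, int M, int N, int K,
                                 int32_t a_offset, int32_t b_offset)
    {
        std::vector<int32_t> vector_sum_row(M, 0); // role of ClGemmLowpMatrixAReductionKernel
        std::vector<int32_t> vector_sum_col(N, 0); // role of ClGemmLowpMatrixBReductionKernel

        for(int i = 0; i < M; ++i)
            for(int k = 0; k < K; ++k)
                vector_sum_row[i] += a[i * K + k];

        for(int k = 0; k < K; ++k)
            for(int j = 0; j < N; ++j)
                vector_sum_col[j] += b[k * N + j];

        // sum_k (a_ik - a_offset) * (b_kj - b_offset)
        //   = acc_ij - b_offset * rowsum_i(A) - a_offset * colsum_j(B) + K * a_offset * b_offset
        for(int i = 0; i < M; ++i)
            for(int j = 0; j < N; ++j)
                acc[i * N + j] += -b_offset * vector_sum_row[i]
                                  - a_offset * vector_sum_col[j]
                                  + K * a_offset * b_offset;
    }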
a/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h b/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h deleted file mode 100644 index 11188ed062..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmLowpReductionKernel.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H -#define ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Common interface for all OpenCL reduction kernels */ -class IClGemmLowpReductionKernel : public IClKernel -{ -public: - IClGemmLowpReductionKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClGemmLowpReductionKernel); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8. - * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32 - * @param[in] info Kernel metadata: - * - k Number of matrix columns/rows depending on the type of reduction. - * - is_reshaped True if the matrix has been reshaped. - * - scalar Scalar value to multiply each reduced column/row by. - * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. - */ - virtual void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const GEMMLowpReductionKernelInfo &info) = 0; -}; - -/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. - * - * @note This stage is needed to handle the offset of matrix product - * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md - */ -class ClGemmLowpMatrixAReductionKernel : public IClGemmLowpReductionKernel -{ -public: - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8. - * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. 
Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_a, ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClGemmLowpMatrixAReductionKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-};
-
-/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
- *
- * @note This stage is needed to handle the offset of matrix product
- * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
- */
-class ClGemmLowpMatrixBReductionKernel : public IClGemmLowpReductionKernel
-{
-public:
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] mtx_b Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
- * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
- */
- void configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_b, ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClGemmLowpMatrixBReductionKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-};
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H */
diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp
deleted file mode 100644
index 6079644935..0000000000
--- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp
+++ /dev/null
@@ -1,538 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/core/utils/helpers/float_ops.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -using ElementsProcessed = Steps; - -inline Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float beta, - bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((fp_mixed_precision && (src0->data_type() != DataType::F16)), "Mixed precision floating point is supported only for F16 data"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 2 && reshape_info.reinterpret_input_as_3d(), "The src1 tensor cannot have more than 2 dimensions if src0 has to be reinterpreted as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((reshape_info.reinterpret_input_as_3d() || reshape_info.depth_output_gemm3d() != 0) && (src2 != nullptr) - && (!reshape_info.broadcast_bias()), - "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); 
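The checks above (and the bias dimension checks that follow for both the reshaped and non-reshaped paths) describe the operation this kernel ultimately computes: dst = alpha * (A * B) + beta * C, where C is either a full M x N matrix or, when broadcast_bias is set, a single row of length N added to every output row. A scalar reference of that contract, illustrative only:

    #include <cstddef>
    #include <vector>

    // Scalar reference of the GEMM contract being validated above.
    // a: M x K, b: K x N, bias: M x N or 1 x N (broadcast), dst: M x N, all row-major.
    void gemm_reference(const std::vector<float> &a, const std::vector<float> &b,
                        const std::vector<float> &bias, std::vector<float> &dst,
                        std::size_t M, std::size_t N, std::size_t K,
                        float alpha, float beta, bool broadcast_bias)
    {
        for(std::size_t i = 0; i < M; ++i)
        {
            for(std::size_t j = 0; j < N; ++j)
            {
                float acc = 0.f;
                for(std::size_t k = 0; k < K; ++k)
                {
                    acc += a[i * K + k] * b[k * N + j];
                }
                const float c = bias.empty() ? 0.f : (broadcast_bias ? bias[j] : bias[i * N + j]);
                dst[i * N + j] = alpha * acc + beta * c;
            }
        }
    }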
- - if(!is_interleaved_transposed) - { - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != src1->dimension(1)); - - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) - { - const unsigned int m = reshape_info.reinterpret_input_as_3d() ? src0->dimension(1) * src0->dimension(2) : src0->dimension(1); - const unsigned int n = src1->dimension(0); - const unsigned int src2_dim0 = src2->dimension(0); - const unsigned int src2_dim1 = src2->dimension(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); - if(reshape_info.broadcast_bias()) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix"); - } - } - } - else - { - GEMMRHSMatrixInfo rhs_info; - GEMMLHSMatrixInfo lhs_info; - const auto m = static_cast(reshape_info.m()); - const auto n = static_cast(reshape_info.n()); - const int k = reshape_info.k(); - const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width(); - const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); - rhs_info.n0 = max_cl_vector_width / src1->element_size(); - rhs_info.k0 = 1; - rhs_info.h0 = mult_transpose1xW_width; - rhs_info.interleave = false; - rhs_info.transpose = false; - lhs_info.m0 = 4; - lhs_info.k0 = 4; - lhs_info.v0 = mult_interleave4x4_height; - lhs_info.interleave = true; - lhs_info.transpose = true; - - TensorShape tensor_shape0{ src0->tensor_shape() }; - tensor_shape0.set(0, k); - tensor_shape0.set(1, m); - - TensorShape tensor_shape1{ src1->tensor_shape() }; - tensor_shape1.set(0, n); - tensor_shape1.set(1, k); - - const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0); - const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - - const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(tensor_info0, lhs_info)); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) - { - const unsigned int src2_dim0 = src2->dimension(0); - const unsigned int src2_dim1 = src2->dimension(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); - if(reshape_info.broadcast_bias()) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix"); - } - } - } - - if(dst->total_size() != 0) - { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, is_interleaved_transposed, reshape_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); - } - - return Status{}; -} - -inline std::pair validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, - float beta, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, - ElementsProcessed 
&num_elements_processed) -{ - ARM_COMPUTE_UNUSED(beta); - bool window_changed = false; - Window win{}; - Window win_out{}; - - const DataType data_type = src0->data_type(); - unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; - unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; - bool reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d(); - bool reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0); - - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(reinterpret_input_as_3d == reinterpret_output_as_3d) - { - reinterpret_input_as_3d = false; - reinterpret_output_as_3d = false; - } - - // dst tensor auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, is_interleaved_transposed, reshape_info))); - - TensorInfo tmp_info(*dst); - - if(reinterpret_output_as_3d) - { - // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(dst->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - if(is_interleaved_transposed) - { - // reinterpret_input_as_3d is not supported if is_interleaved_transposed is set - ARM_COMPUTE_ERROR_ON(reshape_info.reinterpret_input_as_3d()); - - // Configure kernel window - num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type); - num_elems_processed_per_iteration_y = 4; - - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - if(src2 != nullptr) - { - const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; - - const int bias_processed_per_iteration_y = reshape_info.broadcast_bias() ? 1 : num_elems_processed_per_iteration_y; - - AccessWindowStatic src2_access(src2, 0, 0, - ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), - ceil_to_multiple(src2->dimension(1), bias_processed_per_iteration_y)); - - window_changed = update_window_and_padding(win, src2_access); // window used by the execute_window_loop - } - } - else // The input tensors have not been reshaped - { - // Special case for 1xN, 2xN, 3xN and 4xN src0 tensor. num_elems_processed_per_iteration_x is set up for the default case. - num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type); - num_elems_processed_per_iteration_y = std::min(static_cast(dst->dimension(1)), 4); - - // Create kernels according to the architecture, data type and input size. - GPUTarget arch_target = get_arch_from_target(gpu_target); - if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32) - { - num_elems_processed_per_iteration_x = (src1->dimension(0) <= 1000 && src0->num_dimensions() == 1) ? 
2 : 4; - } - - // Configure window - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowStatic src0_access(src0, 0, 0, src0->dimension(0), src0->dimension(1)); - AccessWindowStatic src1_access(src1, 0, 0, ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), src1->dimension(1)); - AccessWindowStatic dst_access(dst, 0, 0, - dst->dimension(0), - dst->dimension(1)); - - if(src2 != nullptr) - { - const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; - - AccessWindowStatic src2_access(src2, 0, 0, - ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), - src2->dimension(1)); - - window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor - } - else - { - window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor - } - } - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast(dst->num_dimensions()), 2u); - collapsed = win.collapse(win, dimension_to_collapse); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, collapsed); -} -} // namespace - -ClGemmMatrixMultiplyKernel::ClGemmMatrixMultiplyKernel() -{ - _type = CLKernelType::GEMM; -} - -void ClGemmMatrixMultiplyKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, - float beta, - bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision, const ActivationLayerInfo &activation_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - - // Perform validate step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, beta, - is_interleaved_transposed, reshape_info, fp_mixed_precision)); - - auto padding_info = is_interleaved_transposed ? get_padding_info({ src0, src1, dst }) : get_padding_info({ src0, dst }); - - _reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d(); - _reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0); - _add_bias = src2 != nullptr; - - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) - { - _reinterpret_input_as_3d = false; - _reinterpret_output_as_3d = false; - } - - // Check if we need to slide the matrix B - const unsigned int num_dimensions_src0 = _reinterpret_input_as_3d ? 
src0->num_dimensions() - 1 : src0->num_dimensions(); - - _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); - - const DataType data_type = src0->data_type(); - - // Get target architecture - GPUTarget gpu_target = get_target(); - - ElementsProcessed num_elements_processed{}; - - // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, src2, dst, beta, is_interleaved_transposed, reshape_info, - gpu_target, num_elements_processed); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, both will be turned off (false) - // in which case we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. - // This means that the actual m used by the kernel is given by dst->dimension(1) - const unsigned int internal_m = _reinterpret_output_as_3d ? dst->dimension(1) * dst->dimension(2) : dst->dimension(1); - const unsigned int n = dst->dimension(0); - - const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(1) : src0->dimension(1); - const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(2) : src0->dimension(2); - - const unsigned int m0 = num_elements_processed.y(); - const unsigned int n0 = num_elements_processed.x(); - - // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. - const unsigned int partial_store_m0 = internal_m % m0; - const unsigned int partial_store_n0 = n % n0; - - // Create build options - CLBuildOptions build_opts; - - build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); - build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); - build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); - build_opts.add_option_if(reshape_info.broadcast_bias(), "-DBROADCAST_BIAS"); - build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); - build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); - build_opts.add_option_if(activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(activation_info.activation()))); - build_opts.add_option_if(activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(activation_info.a())); - build_opts.add_option_if(activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(activation_info.b())); - build_opts.add_option("-DIN1_DIM_X=" + support::cpp11::to_string(src1->dimension(0))); - - const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST; - - std::string kernel_name; - if(is_interleaved_transposed) - { - const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width(); - const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); - - build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m)); - build_opts.add_option("-DN=" + 
support::cpp11::to_string(n)); - build_opts.add_option("-DK=" + support::cpp11::to_string(src1->dimension(0) / (n0 * mult_transpose1xW_width))); - build_opts.add_option("-DH0=" + support::cpp11::to_string(mult_transpose1xW_width)); - build_opts.add_option("-DV0=" + support::cpp11::to_string(mult_interleave4x4_height)); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - - if(is_data_type_float(data_type) && is_bifrost) - { - kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type)) + "_bifrost"; - } - else - { - kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type)); - if(fp_mixed_precision && data_type == DataType::F16) - { - // currently wider accumulator is only supported for fp16 kernels. - kernel_name += "_acc32"; - } - } - } - else // The input tensors have not been reshaped - { - build_opts.add_option("-DN=" + support::cpp11::to_string(n)); - build_opts.add_option("-DK=" + support::cpp11::to_string(src0->dimension(0))); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); - build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); - build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - - // Create kernels according to the architecture, data type and input size. - if(is_data_type_float(data_type) && is_bifrost) - { - kernel_name = "gemm_mm_floating_point"; - - if(src0->num_dimensions() != 1) - { - kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost"; - if(fp_mixed_precision && data_type == DataType::F16) - { - // currently wider accumulator is only supported for fp16 kernels. - kernel_name += "_acc32"; - } - } - else if(src1->dimension(0) <= 1000 && data_type == DataType::F32) - { - // The first kernel is optimized for the case of 1000 or less dst elements (e.g. FC8 of AlexNet and VGG-16, and - // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 dst elements (e.g. - // FC6 and FC7 of AlexNet and VGG-16). - kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost_1000"; - } - - // The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels - // via exhaustive autotuning over a range of representative layer configurations. - set_lws_hint(cl::NDRange(4)); - } - else // (MIDGARD and F32) or (F16) - { - kernel_name = "gemm_mm_floating_point"; - } - } - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set config_id for enabling LWS tuning - _config_id = "gemm_"; - _config_id += (is_interleaved_transposed ? "reshaped_" : ""); - _config_id += (_add_bias ? "add_bias_" : ""); - _config_id += (reshape_info.broadcast_bias() ? "broadcast_bias_" : ""); - _config_id += (fp_mixed_precision ? "fp_mixed_" : ""); - _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); - _config_id += (_reinterpret_output_as_3d ? 
"3do_" : ""); - _config_id += lower_string(string_from_data_type(src0->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(3)); - _config_id += "_"; - _config_id += (is_interleaved_transposed ? support::cpp11::to_string(src1->dimension(0)) : support::cpp11::to_string(src1->dimension(1))); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClGemmMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, - bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision, const ActivationLayerInfo &activation_info) -{ - // Note: num_elements_processed will be set in validate_and_configure_window() - ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_UNUSED(activation_info); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, beta, is_interleaved_transposed, reshape_info, fp_mixed_precision)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - (src2 != nullptr) ? src2->clone().get() : nullptr, - dst->clone().get(), - beta, - is_interleaved_transposed, - reshape_info, - gpu_target, - num_elements_processed) - .first); - - return Status{}; -} - -void ClGemmMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); - - if(src1->info()->num_dimensions() < 3) - { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); - } - - Window slice = window.first_slice_window_3D(); - Window slice_matrix_b = slice; - - slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - const unsigned int num_arguments_bias = _add_bias ? num_arguments_per_2D_tensor() + 1 : 0; - - if(_reinterpret_input_as_3d) - { - // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + num_arguments_bias; - const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - if(_reinterpret_output_as_3d) - { - // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 
1 : 0) + num_arguments_bias; - const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - do - { - Window slice_b = slice; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) - { - slice_b = slice_matrix_b; - } - - unsigned int idx = 0; - add_2D_tensor_argument(idx, src0, slice); - add_2D_tensor_argument(idx, src1, slice_b); - if(_add_bias) - { - add_2D_tensor_argument(idx, src2, slice); - } - add_2D_tensor_argument(idx, dst, slice); - _kernel.setArg(idx++, static_cast(src0->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[2])); - if(_add_bias) - { - _kernel.setArg(idx++, static_cast(src2->info()->strides_in_bytes()[2])); - } - _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[2])); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h deleted file mode 100644 index c303f78b07..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_KERNEL_H -#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to multiply two input matrices "A" and "B" and add a martix "C" if provided. All elements of the output matrix will be multiplied by alpha. In case matrix C is passed, it will be added to the previous result. 
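Earlier in this hunk, run_op passes the optional cross-plane paddings at argument index 3 * num_arguments_per_2D_tensor() + 3 (plus num_arguments_per_2D_tensor() + 1 when a bias is present). That constant follows directly from the argument layout the enqueue loop builds: one set of 2-D tensor arguments per tensor, then one stride_z word per tensor, and only then the padding words. A small restatement of that arithmetic (illustrative helper, not ACL API):

    #include <cstddef>

    // args_per_2d stands in for num_arguments_per_2D_tensor(). Per slice the kernel
    // receives, in order:
    //   [src0][src1][src2 if bias][dst]                       2-D tensor arguments
    //   [stride_z src0][src1][src2 if bias][dst]              one word each
    //   [input cross-plane pad][output cross-plane pad]       optional, set once before the loop
    std::size_t first_cross_plane_pad_index(std::size_t args_per_2d, bool add_bias)
    {
        const std::size_t num_tensors = add_bias ? 4 : 3;
        return num_tensors * args_per_2d + num_tensors; // 3*N + 3, or 4*N + 4 with bias
    }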
- * For the matrix C, the broadcast addition is supported if the flag "broadcast_bias" is set in the GEMMReshapeInfo object - * - * @note If the input tensors @p src0 and @p src1 have been reshaped respectively with @ref ClGemmReshapeLhsMatrixKernel" and @ref ClGemmReshapeRhsMatrixKernel, - * the flag @p is_interleaved_transposed must be set to true - * - * @attention @p src1 tensor must have at least 2 dimensions (matrix) - */ -class ClGemmMatrixMultiplyKernel : public IClKernel -{ -public: - ClGemmMatrixMultiplyKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyKernel); - /** Initialise the kernel's input, output and alpha - * - * @param[in] compile_context The compile context to be used. - * @param[in] src0 Input tensor containing the Matrix A. Data types supported: F16/F32 - * @param[in] src1 Input tensor containing the Matrix B. Data type supported: same as @p src0 - * @param[in] src2 Input tensor containing the Matrix C (bias). Can be nullptr. Data type supported: same as @p src0 - * @param[out] dst Output tensor to store the result of matrix multiplication. Data type supported: same as @p src0 - * @param[in] alpha Weight of the matrix product - * @param[in] beta (Optional) Weight of vector C. Default value is 0. Only beta = 1 is currently supported. - * @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref ClGemmReshapeLhsMatrixKernel and @ref ClGemmReshapeRhsMatrixKernel - * @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped - * @param[in] fp_mixed_precision (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy - * @param[in] activation_info (Optional) Activation to apply after the matrix multiplication - * - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta = 0.f, - bool is_interleaved_transposed = true, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo(), bool fp_mixed_precision = false, const ActivationLayerInfo &activation_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmMatrixMultiplyKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, - bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision = false, const ActivationLayerInfo &activation_info = ActivationLayerInfo()); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -public: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _add_bias{ false }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp deleted file mode 100644 index 5ae55ab04a..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp +++ /dev/null @@ -1,416 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/core/utils/helpers/float_ops.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -using ElementsProcessed = Steps; - -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, - const GEMMKernelInfo &gemm_info) -{ - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); - ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) - && (!gemm_info.broadcast_bias), - "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported 
for GEMM native"); - - const unsigned int m = gemm_info.m; - const unsigned int n = gemm_info.n; - const unsigned int k = gemm_info.k; - - ARM_COMPUTE_UNUSED(m); - ARM_COMPUTE_UNUSED(n); - ARM_COMPUTE_UNUSED(k); - - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != n); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != k); - if(gemm_info.reinterpret_input_as_3d) - { - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != m); - } - - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) - { - const unsigned int src2_dim0 = src2->dimension(0); - const unsigned int src2_dim1 = src2->dimension(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); - if(gemm_info.broadcast_bias) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix"); - } - } - - if(dst->total_size() != 0) - { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, - const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed) -{ - unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; - unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; - bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; - - Window win{}; - Window win_out{}; - bool window_changed = false; - - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. 
- if(reinterpret_input_as_3d == reinterpret_output_as_3d) - { - reinterpret_output_as_3d = false; - } - - // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); - - TensorInfo tmp_info(*dst); - - if(reinterpret_output_as_3d) - { - // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(dst->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - // Configure kernel window - num_elems_processed_per_iteration_x = rhs_info.n0; - num_elems_processed_per_iteration_y = lhs_info.m0; - - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - AccessWindowStatic src0_access(src0, 0, 0, - src0->dimension(0), - src0->dimension(1)); - AccessWindowStatic src1_access(src1, 0, 0, - ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), - src1->dimension(1)); - AccessWindowStatic dst_access(dst, 0, 0, - dst->dimension(0), - dst->dimension(1)); - - if(src2 != nullptr) - { - const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; - - AccessWindowStatic src2_access(src2, 0, 0, - ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), - src2->dimension(1)); - - window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor - } - else - { - window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor - } - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast(dst->num_dimensions()), 2u); - collapsed = win.collapse(win, dimension_to_collapse); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, collapsed); -} -} // namespace - -ClGemmMatrixMultiplyNativeKernel::ClGemmMatrixMultiplyNativeKernel() -{ - _type = CLKernelType::GEMM; -} - -void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, - float beta, - const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - - auto padding_info = get_padding_info({ src0, dst }); - _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; - _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; - _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); - _add_bias = src2 != nullptr; - - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) - { - _reinterpret_input_as_3d = false; - _reinterpret_output_as_3d = false; - } - - // Check if we need to slide the matrix B - const unsigned int num_dimensions_src0 = src0->num_dimensions(); - _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); - - ElementsProcessed num_elements_processed{}; - - // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, src2 != nullptr ? src2 : nullptr, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - IClKernel::configure_internal(win_config.second); - - // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, - // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. - // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m - const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1); - - const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(1) : src0->dimension(1); - const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(2) : src0->dimension(2); - - // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. - const unsigned int partial_store_m0 = internal_m % lhs_info.m0; - const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0; - - // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. 
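The PARTIAL_STORE_M0 / PARTIAL_STORE_N0 values computed above tell the device code how many rows and columns the last block in each dimension really contains, so the boundary work-items can store a smaller block instead of relying on output padding. A small worked example under assumed sizes (illustrative only):

    #include <cstddef>

    // Tail-block sizes for the boundary work-items, mirroring the two modulo
    // computations above; a result of 0 means the last block is a full one.
    struct TailBlocks
    {
        std::size_t partial_m0;
        std::size_t partial_n0;
    };

    TailBlocks tail_blocks(std::size_t m, std::size_t n, std::size_t m0, std::size_t n0)
    {
        return { m % m0, n % n0 };
    }
    // e.g. m = 33, n = 24, m0 = 4, n0 = 16 -> partial_m0 = 1, partial_n0 = 8:
    // the bottom row of blocks stores 1 row and the right-hand column of blocks stores 8 columns.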
- // NOTE: This might have implications on heuristics and performance - const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0); - - // Create build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); - build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); - build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); - build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); - build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); - build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); - build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); - build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); - build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m)); - build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n)); - build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k)); - build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); - build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); - build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); - - std::string kernel_name("gemm_mm_native"); - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += (_add_bias ? "add_bias_" : ""); - _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : ""); - _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); - _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); - _config_id += (gemm_info.activation_info.enabled() ? 
"fused_activation_" : ""); - _config_id += lower_string(string_from_data_type(src0->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(gemm_info.k); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.m0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.n0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.k0); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClGemmMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) -{ - ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - src2 != nullptr ? src2->clone().get() : nullptr, - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info, - num_elements_processed) - .first); - - return Status{}; -} - -void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); - - if(src1->info()->num_dimensions() < 3) - { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); - } - - Window slice = window.first_slice_window_3D(); - Window slice_matrix_b = slice; - - slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - if(_reinterpret_input_as_3d) - { - // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor - unsigned int idx0; - if(_add_bias) - { - idx0 = 4 * num_arguments_per_2D_tensor() + 4; - } - else - { - idx0 = 3 * num_arguments_per_2D_tensor() + 3; - } - const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - if(_reinterpret_output_as_3d) - { - // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor - unsigned int idx0; - if(_add_bias) - { - idx0 = 4 * num_arguments_per_2D_tensor() + 4 + (_reinterpret_input_as_3d ? 1 : 0); - } - else - { - idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 
1 : 0); - } - const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - do - { - Window slice_b = slice; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) - { - slice_b = slice_matrix_b; - } - - unsigned int idx = 0; - add_2D_tensor_argument(idx, src0, slice); - add_2D_tensor_argument(idx, src1, slice_b); - if(_add_bias) - { - add_2D_tensor_argument(idx, src2, slice); - } - add_2D_tensor_argument(idx, dst, slice); - _kernel.setArg(idx++, static_cast(src0->info()->strides_in_bytes()[2])); - _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[2])); - if(_add_bias) - { - _kernel.setArg(idx++, static_cast(src2->info()->strides_in_bytes()[2])); - } - _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[2])); - enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h deleted file mode 100644 index cd7bf278c2..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H -#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to multiply matrices when neither of the input matrices have been reshaped */ -class ClGemmMatrixMultiplyNativeKernel : public IClKernel -{ -public: - ClGemmMatrixMultiplyNativeKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyNativeKernel); - /** Initialise the kernel's input and dst. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src0 Input tensor for the LHS matrix. Data type supported: F32. 
The number of dimensions for the LHS matrix must be less or equal than 4. - * @param[in] src1 Input tensor for the RHS matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less or equal than 3. - * @param[in] src2 Input tensor containing the bias matrix. Data type supported: same as @p src0. - * @param[out] dst dst tensor info. Data type supported: same as @p src0 - * @param[in] alpha Weight of the matrix product - * @param[in] beta Weight of the matrix bias - * @param[in] lhs_info LHS matrix information used to retrieve the number of rows and accumulations to be processed by each thread. Only the following values are supported: - * lhs_info.m0: 1,2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * @param[in] rhs_info RHS matrix information used to retrieve the number of columns and accumulations to be processed by each thread. Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: same of lhs_info.k0 - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, - const GEMMKernelInfo &gemm_info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmMatrixMultiplyNativeKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, - const GEMMKernelInfo &gemm_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _add_bias{ false }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp deleted file mode 100644 index 591834f762..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp +++ /dev/null @@ -1,421 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
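For reference, the tile-size bookkeeping behind the -DM0 / -DPARTIAL_STORE_M0 / -DPARTIAL_STORE_N0 options set in the native kernel's configure() above can be summarised as a small standalone sketch. The names below are hypothetical and the snippet is not part of the library; the kernels compute the same quantities inline (M0 is clamped to M to prevent out-of-bounds reads, and the remainders size the partial boundary blocks so the destination needs no padding).

#include <algorithm>
#include <cstdio>

// Hypothetical standalone helper: derives the block size and partial-block
// sizes that the GEMM kernels receive as build options.
struct GemmBlocking
{
    unsigned int m0;               // effective block height (M0)
    unsigned int partial_store_m0; // rows in the last, partial block of a column
    unsigned int partial_store_n0; // columns in the last, partial block of a row
};

GemmBlocking compute_blocking(unsigned int internal_m, unsigned int n,
                              unsigned int lhs_m0, unsigned int rhs_n0)
{
    // Shrink M0 so it never exceeds M ("internal_m0" in the kernels above).
    const unsigned int m0 = std::min(internal_m, lhs_m0);
    // Remainders size the boundary blocks: they are written with narrowed
    // (partial) stores instead of padding the destination tensor.
    return { m0, internal_m % m0, n % rhs_n0 };
}

int main()
{
    const GemmBlocking b = compute_blocking(/*internal_m=*/37, /*n=*/50, /*lhs_m0=*/4, /*rhs_n0=*/8);
    std::printf("M0=%u PARTIAL_STORE_M0=%u PARTIAL_STORE_N0=%u\n", b.m0, b.partial_store_m0, b.partial_store_n0);
    return 0;
}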
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CL/CLUtils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/core/utils/helpers/float_ops.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -using ElementsProcessed = Steps; - -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, - const GEMMKernelInfo &gemm_info) -{ - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose == rhs_info.transpose); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_info.transpose) && ((lhs_info.m0 & (lhs_info.m0 - 1)) && lhs_info.m0 != 3), "Only 2,3,4,8,16 are supported for m0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.transpose) && ((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) - && (!gemm_info.broadcast_bias), - "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32), "Mixed precision only supported for F16 data type"); - ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info)); - - const unsigned int m = gemm_info.m; - const unsigned int n = gemm_info.n; - const unsigned int k = gemm_info.k; - - TensorShape tensor_shape0{ src0->tensor_shape() }; - tensor_shape0.set(0, k); - tensor_shape0.set(1, m); - - TensorShape tensor_shape1{ 
src1->tensor_shape() }; - tensor_shape1.set(0, n); - tensor_shape1.set(1, k); - - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) - { - const unsigned int src2_dim0 = src2->dimension(0); - const unsigned int src2_dim1 = src2->dimension(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); - if(gemm_info.broadcast_bias) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix"); - } - } - - const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0); - const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - - const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(tensor_info0, lhs_info)); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - - if(dst->total_size() != 0) - { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, - const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed) -{ - unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; - unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; - bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; - - Window win{}; - Window win_out{}; - bool window_changed = false; - - // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); - - TensorInfo tmp_info(*dst); - - if(reinterpret_output_as_3d) - { - // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(dst->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - // Configure kernel window - num_elems_processed_per_iteration_x = rhs_info.n0; - num_elems_processed_per_iteration_y = lhs_info.m0; - - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - if(src2 != nullptr) - { - const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; - - const int bias_processed_per_iteration_y = gemm_info.broadcast_bias ? 
1 : num_elems_processed_per_iteration_y; - - AccessWindowStatic src2_access(src2, 0, 0, - ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), - ceil_to_multiple(src2->dimension(1), bias_processed_per_iteration_y)); - - window_changed = update_window_and_padding(win, src2_access); // window used by the execute_window_loop - } - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast(dst->num_dimensions()), 2u); - collapsed = win.collapse(win, dimension_to_collapse); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, collapsed); -} -} // namespace - -ClGemmMatrixMultiplyReshapedKernel::ClGemmMatrixMultiplyReshapedKernel() -{ - _type = CLKernelType::GEMM; -} - -void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, - ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - - auto padding_info = get_padding_info({ src0, dst }); - _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; - _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); - _add_bias = src2 != nullptr; - _export_to_cl_image = rhs_info.export_to_cl_image; - _k = gemm_info.k; - - // Check if we need to slide the matrix B - const unsigned int num_dimensions_src0 = src0->num_dimensions(); - _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); - - ElementsProcessed num_elements_processed{}; - - // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, src2, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - const bool enable_mixed_precision = gemm_info.fp_mixed_precision; - const DataType data_type = src0->data_type(); - - // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. - const unsigned int internal_m = _reinterpret_output_as_3d ? 
gemm_info.m : dst->dimension(1); - - const unsigned int partial_store_m0 = internal_m % lhs_info.m0; - const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0; - - // Create build options - CLBuildOptions build_opts; - build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); - build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); - build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); - build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); - build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); - build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE"); - build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); - build_opts.add_option_if(lhs_info.transpose, "-DLHS_TRANSPOSE"); - build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); - build_opts.add_option_if(enable_mixed_precision, "-DMIXED_PRECISION"); - build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT"); - build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1))); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); - build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision ? get_cl_type_from_data_type(DataType::F32) : get_cl_type_from_data_type(data_type))); - build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m)); - build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n)); - build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k)); - build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0)); - build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); - build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0)); - build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0)); - build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - - std::string kernel_name("gemm_mm_reshaped_"); - kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_"; - kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt"; - kernel_name += rhs_info.export_to_cl_image ? "_texture" : ""; - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += (_add_bias ? 
"add_bias_" : ""); - _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : ""); - _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); - _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : ""); - _config_id += lower_string(string_from_data_type(src0->data_type())); - _config_id += "_"; - _config_id += (enable_mixed_precision ? "mixed_precision_" : ""); - _config_id += support::cpp11::to_string(dst->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(gemm_info.k); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.m0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.n0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.k0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.v0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.h0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.interleave); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.interleave); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClGemmMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) -{ - ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - src2 != nullptr ? 
src2->clone().get() : nullptr, - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info, - num_elements_processed) - .first); - - return Status{}; -} - -void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); - - if(src1->info()->num_dimensions() < 3) - { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); - } - - Window slice = window.first_slice_window_3D(); - Window slice_matrix_b = slice; - - slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; - - cl::Image2D src1_image2d; - - if(_export_to_cl_image) - { - const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2)); - const size_t image_row_pitch = src1->info()->strides_in_bytes()[1]; - - src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch); - } - - do - { - Window slice_b = slice; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) - { - slice_b = slice_matrix_b; - } - - unsigned int idx = 0; - - // LHS buffer - add_2D_tensor_argument(idx, src0, slice); - - // RHS buffer or RHS OpenCL image (_export_to_cl_image == true) - if(_export_to_cl_image) - { - _kernel.setArg(idx++, src1_image2d); - } - else - { - add_2D_tensor_argument(idx, src1, slice_b); - } - - // Bias buffer (_add_bias == true) - add_2D_tensor_argument_if(_add_bias, idx, src2, slice); - - // dst buffer - add_2D_tensor_argument(idx, dst, slice); - - // K dimension (not used if _export_to_cl_image == true) - _kernel.setArg(idx++, static_cast(_k)); - - // LHS stride_z - _kernel.setArg(idx++, static_cast(src0->info()->strides_in_bytes()[2])); - - // RHS stride_z (not used if _export_to_cl_image == true) - _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[2])); - - // Bias stride_z (if _add_bias == true) - if(_add_bias) - { - _kernel.setArg(idx++, static_cast(src2->info()->strides_in_bytes()[2])); - } - - // dst stride_z - _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[2])); - - // Cross-plan padding (if _reinterpret_output_as_3d = true) - if(_reinterpret_output_as_3d) - { - _kernel.setArg(idx++, static_cast(total_cross_plane_pad)); - } - - // Dispatch kernel - enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute \ No newline at end of file diff 
--git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h deleted file mode 100644 index b8ae4b9ae3..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H -#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -#include "arm_compute/core/KernelDescriptors.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to multiply matrices when both the input matrices LHS (src0) and RHS (src1) have been reshaped - * - * @note The input matrices @p src0 and @p src1 must be reshaped through: - * - @ref ClGemmReshapeLhsMatrixKernel - * - @ref ClGemmReshapeRhsMatrixKernel - */ -class ClGemmMatrixMultiplyReshapedKernel : public IClKernel -{ -public: - ClGemmMatrixMultiplyReshapedKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyReshapedKernel); - /** Initialise the kernel's input and output. - * - * @note The F16 computation also supports mixed precision through the gemm_info.fp_mixed_precision flag. - * Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the - * multiplications. i.e. float c = (half)a * (half)b - * - * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function. - * Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer, - * the following conditions are required: - * -# rhs_info.n0 can only be 4, 8 and 16 - * -# rhs_info.k0 can only be 4, 8 and 16 - * -# Data type can only be F32 - * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension - * -# The stride Y for the src1 should satisfy the OpenCL pitch alignment requirement - * -# src1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4) - * -# src1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT - * - * @param[in] compile_context The compile context to be used. 
- * @param[in] src0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less or equal than 4 - * @param[in] src1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less or equal than 3 - * @param[in] src2 Input tensor containing the bias matrix. Data type supported: same as @p src0. - * @param[out] dst dst tensor to store the result of matrix multiplication. Data type supported: same as @p src0 - * @param[in] alpha Weight of the matrix product - * @param[in] beta Weight of the matrix bias - * @param[in] lhs_info LHS matrix information used for reshaping the src0 tensor. Only the following values are supported: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * lhs_info.transpose: false - * @param[in] rhs_info RHS matrix information used for reshaping the src1 tensor. Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true) - * rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true) - * rhs_info.transpose: true - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - * - * @note lhs_info.k0 must be equal to rhs_info.k0 - */ - void configure(const ClCompileContext &compile_context, - ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmMatrixMultiplyReshapedKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, - const GEMMKernelInfo &gemm_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - bool _slide_matrix_b{ true }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _add_bias{ false }; - bool _export_to_cl_image{ false }; - unsigned int _k{ 1 }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H */ \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp deleted file mode 100644 index 32ee0f9705..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp +++ /dev/null @@ -1,443 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. 
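When rhs_info.export_to_cl_image is enabled, run_op above wraps the RHS buffer in a 2D image whose width is dimension(0) / 4 (presumably four elements per pixel, which also explains the documented CL_DEVICE_IMAGE2D_MAX_WIDTH * 4 limit), whose height is dimension(1) * dimension(2) and whose row pitch is the RHS Y stride in bytes. A minimal sketch of that mapping and of the documented size limits follows; the helper names are hypothetical, and the library performs the real validation in gemm::validate_image2d_support_on_rhs().

#include <cstddef>

// Hypothetical helpers; run_op does the equivalent inline before calling
// create_image2d_from_buffer().
struct RhsImage2d
{
    std::size_t width_px;        // dimension(0) / 4: four RHS elements per image pixel
    std::size_t height_px;       // dimension(1) * dimension(2): all batches stacked vertically
    std::size_t row_pitch_bytes; // Y stride of the RHS tensor; must meet the CL pitch alignment
};

RhsImage2d map_rhs_to_image2d(std::size_t dim0, std::size_t dim1, std::size_t dim2,
                              std::size_t stride_y_bytes)
{
    return { dim0 / 4, dim1 * dim2, stride_y_bytes };
}

// Size limits from the class documentation: src1 width <= CL_DEVICE_IMAGE2D_MAX_WIDTH * 4
// (i.e. width_px <= max image width) and height * depth <= CL_DEVICE_IMAGE2D_MAX_HEIGHT.
bool rhs_fits_image2d(const RhsImage2d &img, std::size_t max_image_width_px,
                      std::size_t max_image_height_px)
{
    return img.width_px <= max_image_width_px && img.height_px <= max_image_height_px;
}

Usage would feed the RHS tensor's first three dimensions, its strides_in_bytes()[1], and the device limits queried through clGetDeviceInfo; the remaining documented conditions (F32 data, n0/k0 in {4, 8, 16}, cl_khr_image2d_from_buffer support) are checked separately by the library.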
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CL/CLUtils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/core/utils/helpers/float_ops.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -using ElementsProcessed = Steps; - -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) -{ - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_info.m0 < 1 || lhs_info.m0 > 8, "Only 1,2,3,4,5,6,7,8 are supported for m0"); - ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16 || rhs_info.k0 < 2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); - ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16 || rhs_info.n0 < 2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) - && (!gemm_info.broadcast_bias), - "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); - 
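The recurring ((x & (x - 1)) && x != 3) pattern in the validators above is how the "Only 2,3,4,8,16 are supported" rule is enforced: x & (x - 1) is zero exactly when x is a power of two, so the error fires for any value that is neither a power of two nor 3, while the neighbouring range checks bound the value to [2, 16]. A self-contained sketch of the combined predicate, with a hypothetical helper name:

#include <cassert>
#include <initializer_list>

// Combines the bit test and the range checks used by the kernels above.
bool is_supported_block_size(unsigned int v)
{
    const bool power_of_two = (v & (v - 1)) == 0; // true for 1, 2, 4, 8, 16, ...
    return (power_of_two || v == 3) && v >= 2 && v <= 16;
}

int main()
{
    for (unsigned int v : { 2u, 3u, 4u, 8u, 16u })       { assert(is_supported_block_size(v));  } // accepted
    for (unsigned int v : { 1u, 5u, 6u, 7u, 12u, 32u })   { assert(!is_supported_block_size(v)); } // rejected
    return 0;
}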
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported"); - ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info)); - - const unsigned int m = gemm_info.m; - const unsigned int n = gemm_info.n; - const unsigned int k = gemm_info.k; - - TensorShape tensor_shape1{ src1->tensor_shape() }; - tensor_shape1.set(0, n); - tensor_shape1.set(1, k); - - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) - { - const unsigned int src2_dim0 = src2->dimension(0); - const unsigned int src2_dim1 = src2->dimension(1); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src0); - if(gemm_info.broadcast_bias) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix"); - } - } - - const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); - - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k); - if(gemm_info.reinterpret_input_as_3d) - { - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != m); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - - if(dst->total_size() != 0) - { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed) -{ - unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; - unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; - bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; - - Window win{}; - Window win_out{}; - bool window_changed = false; - - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. 
- // This approach should only be used when the input/dst tensors have pad on the y direction - if((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y) - { - reinterpret_output_as_3d = false; - } - - // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); - - TensorInfo tmp_info(*dst); - - if(reinterpret_output_as_3d) - { - // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(dst->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - // Configure kernel window - num_elems_processed_per_iteration_x = rhs_info.n0; - num_elems_processed_per_iteration_y = lhs_info.m0; - - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - if(src2 != nullptr) - { - const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; - - AccessWindowStatic src2_access(src2, 0, 0, - ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), - src2->dimension(1)); - - window_changed = update_window_and_padding(win, src2_access); - } - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast(dst->num_dimensions()), 2u); - collapsed = win.collapse(win, dimension_to_collapse); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, collapsed); -} -} // namespace - -ClGemmMatrixMultiplyReshapedOnlyRhsKernel::ClGemmMatrixMultiplyReshapedOnlyRhsKernel() -{ - _type = CLKernelType::GEMM; -} - -void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, - ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - - _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; - _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; - _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); - _add_bias = src2 != nullptr; - _export_to_cl_image = rhs_info.export_to_cl_image; - _has_pad_y = gemm_info.has_pad_y; - - auto padding_info = get_padding_info({ src0, src1, dst }); - - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. 
- if((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y) - { - _reinterpret_input_as_3d = false; - _reinterpret_output_as_3d = false; - } - - // Check if we need to slide the matrix B - const unsigned int num_dimensions_src0 = src0->num_dimensions(); - _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); - - ElementsProcessed num_elements_processed{}; - - // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, src2, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - // If _reinterpret_input_as_3d = reinterpret_output_as_3d = true, - // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. - // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m - const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1); - - // These variables are used only if gemm_info.has_pad_y == true - const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(1) : src0->dimension(1); - const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(2) : src0->dimension(2); - - // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. - // NOTE: This might have implications on heuristics and performance - const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0); - - // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. - const unsigned int partial_store_m0 = internal_m % internal_m0; - const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0; - - // Create build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); - build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); - build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); - build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); - build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); - build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); - build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); - build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); - build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT"); - build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1))); - build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m)); - build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n)); - build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k)); - build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); - build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); - build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); - build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + 
lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); - if(_has_pad_y) - { - build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); - } - - std::string kernel_name("gemm_mm_reshaped_only_rhs_"); - kernel_name += rhs_info.transpose ? "t" : "nt"; - kernel_name += rhs_info.export_to_cl_image ? "_texture" : ""; - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += (_has_pad_y ? "" : "no_pad_y_"); - _config_id += (_add_bias ? "add_bias_" : ""); - _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : ""); - _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); - _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); - _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : ""); - _config_id += lower_string(string_from_data_type(src0->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(gemm_info.k); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.m0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.n0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.k0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.h0); - _config_id += "_"; - _config_id += support::cpp11::to_string(rhs_info.interleave); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) -{ - ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - src2 != nullptr ? 
src2->clone().get() : nullptr, - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info, - num_elements_processed) - .first); - - return Status{}; -} - -void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); - - if(src1->info()->num_dimensions() < 3) - { - // The stride_z for matrix B must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); - } - - const size_t lhs_idx_batch_size = _reinterpret_input_as_3d && !_has_pad_y ? 3u : 2u; - const size_t rhs_idx_batch_size = 2u; - const size_t bia_idx_batch_size = 2u; - const size_t out_idx_batch_size = _reinterpret_output_as_3d && !_has_pad_y ? 3u : 2u; - - Window slice = window.first_slice_window_3D(); - Window slice_matrix_b = slice; - - slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); - slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - // Get cross plane pads - const unsigned int total_cross_plane_pad_lhs = src0->info()->padding().top + src0->info()->padding().bottom; - const unsigned int total_cross_plane_pad_out = dst->info()->padding().top + dst->info()->padding().bottom; - - // The execution should fail if we try to run with has_pad_y = false but we have padding in either the LHS or DST tensor - ARM_COMPUTE_ERROR_ON(!_has_pad_y && ((total_cross_plane_pad_lhs != 0) || (total_cross_plane_pad_out != 0))); - - cl::Image2D src1_image2d; - - if(_export_to_cl_image) - { - const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2)); - const size_t image_row_pitch = src1->info()->strides_in_bytes()[1]; - - src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch); - } - - do - { - Window slice_b = slice; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) - { - slice_b = slice_matrix_b; - } - - unsigned int idx = 0; - - // LHS buffer - add_2D_tensor_argument(idx, src0, slice); - - // RHS buffer or RHS OpenCL image (_export_to_cl_image == true) - if(_export_to_cl_image) - { - _kernel.setArg(idx++, src1_image2d); - } - else - { - add_2D_tensor_argument(idx, src1, slice_b); - } - - // Bias buffer (_add_bias == true) - add_2D_tensor_argument_if(_add_bias, idx, src2, slice); - - // dst buffer - add_2D_tensor_argument(idx, dst, slice); - - // LHS stride_z - _kernel.setArg(idx++, static_cast(src0->info()->strides_in_bytes()[lhs_idx_batch_size])); - - // RHS stride_z (not used if _export_to_cl_image == true) - _kernel.setArg(idx++, static_cast(src1->info()->strides_in_bytes()[rhs_idx_batch_size])); - - // Bias stride_z (if _add_bias == true) - if(_add_bias) - { - 
_kernel.setArg(idx++, static_cast(src2->info()->strides_in_bytes()[bia_idx_batch_size])); - } - - // dst stride_z - _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[out_idx_batch_size])); - - // Cross-plan padding (if _reinterpret_input_as_3d = true) - if(_reinterpret_input_as_3d && _has_pad_y) - { - _kernel.setArg(idx++, static_cast(total_cross_plane_pad_lhs)); - } - - // Cross-plan padding (if reinterpret_output_as_3d = true) - if(_reinterpret_output_as_3d && _has_pad_y) - { - _kernel.setArg(idx++, static_cast(total_cross_plane_pad_out)); - } - - enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h deleted file mode 100644 index 3d6164eca9..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H -#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -#include "arm_compute/core/KernelDescriptors.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to multiply matrices when only the input matrix RHS (src1) has been reshaped - * - * @note The input matrix src1 must be reshaped through @ref ClGemmReshapeRhsMatrixKernel - */ -class ClGemmMatrixMultiplyReshapedOnlyRhsKernel : public ICLKernel -{ -public: - ClGemmMatrixMultiplyReshapedOnlyRhsKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyReshapedOnlyRhsKernel); - /** Initialise the kernel's input and output. - * - * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function. - * Reading from the OpenCL image object can increase the performance. 
However, since the OpenCL image object is created importing the OpenCL buffer, - * the following conditions are required: - * -# rhs_info.n0 can only be 4, 8 and 16 - * -# rhs_info.k0 can only be 4, 8 and 16 - * -# Data type can only be F32 - * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension - * -# The stride Y for the src1 should satisfy the OpenCL pitch alignment requirement - * -# src1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4) - * -# src1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT - * - * @param[in] compile_context The compile context to be used. - * @param[in] src0 Input tensor containing the LHS matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). - * The number of dimensions for the LHS matrix must be less or equal than 4. - * @param[in] src1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less or equal than 3. - * @param[in] src2 Input tensor containing the bias matrix. Data type supported: same as @p src0. - * @param[out] dst Output tensor to store the result of matrix multiplication. Data type supported: same as @p src0 - * @param[in] alpha Weight of the matrix product - * @param[in] beta Weight of the matrix bias - * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread. Only the following values are supported: - * lhs_info.m0: 1,2,3,4,5,6,7,8 - * @param[in] rhs_info RHS matrix information used for reshaping the src1 tensor. Only the following values are supported: - * rhs_info.k0: 2,3,4,8,16 - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.transpose: true,false - * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices - */ - void configure(const ClCompileContext &compile_context, - ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _add_bias{ false }; - bool _export_to_cl_image{ false }; - bool _has_pad_y{ false }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp b/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp deleted file mode 100644 index f92945e2a4..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. 
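The stride-z bookkeeping in ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op above (lhs_idx_batch_size, out_idx_batch_size and the cross-plane pads) can be read as follows: when a tensor is reinterpreted as 3D and the dispatch runs without y padding (has_pad_y == false), its batch stride is taken from dimension index 3 instead of 2, and the cross-plane pad argument is only forwarded when the corresponding reinterpret flag and has_pad_y are both set; run_op also asserts the pads are zero whenever has_pad_y is false. A small sketch with hypothetical names, restating the inline logic:

#include <cstddef>

struct BatchStrideIndices
{
    std::size_t lhs; // index into strides_in_bytes() used as the LHS batch stride
    std::size_t out; // index into strides_in_bytes() used as the DST batch stride
};

BatchStrideIndices select_batch_stride_indices(bool reinterpret_input_as_3d,
                                               bool reinterpret_output_as_3d,
                                               bool has_pad_y)
{
    // Mirrors lhs_idx_batch_size / out_idx_batch_size above: 3 when the tensor is
    // reinterpreted as 3D and there is no y padding, otherwise the plain batch index 2.
    return { reinterpret_input_as_3d  && !has_pad_y ? std::size_t{3} : std::size_t{2},
             reinterpret_output_as_3d && !has_pad_y ? std::size_t{3} : std::size_t{2} };
}

// Cross-plane pad forwarded to the kernel only in the has_pad_y path.
constexpr unsigned int cross_plane_pad(unsigned int pad_top, unsigned int pad_bottom)
{
    return pad_top + pad_bottom;
}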
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 == 0); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 == 0); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.v0 == 0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); - ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8); - - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) -{ - const unsigned int num_elems_processed_per_iteration_x = lhs_info.k0; - const unsigned int num_elems_processed_per_iteration_y = lhs_info.m0; - bool window_changed = false; - - TensorInfo tmp_info(*src); - - if(reinterpret_input_as_3d) - { - // Since the src tensor has to be reinterpreted as 3D and the execute window is based on a 2D interleave, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape 
tmp_shape(src->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - // dst auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d))); - - // Configure window - Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - Window win_in = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - AccessWindowStatic src_access(src, 0, 0, - src->dimension(0), - src->dimension(1)); - AccessWindowStatic dst_access(dst, 0, 0, dst->dimension(0), dst->dimension(1)); - - window_changed = update_window_and_padding(win_in, src_access) || // window used by the execute_window_loop - update_window_and_padding(win, dst_access); // window used to update the padding requirements of dst tensor - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win.collapse(win, Window::DimZ); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, collapsed); -} -} // namespace - -ClGemmReshapeLhsMatrixKernel::ClGemmReshapeLhsMatrixKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Perform validate step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d)); - - auto padding_info = get_padding_info({ src }); - - _reinterpret_input_as_3d = reinterpret_input_as_3d; - - const unsigned int src_w = src->dimension(0); - const unsigned int src_h = _reinterpret_input_as_3d ? src->dimension(1) * src->dimension(2) : src->dimension(1); - const unsigned int partial_load_m0 = src_h % lhs_info.m0; - const unsigned int partial_load_k0 = src_w % lhs_info.k0; - - // Create build options - CLBuildOptions build_opts; - build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0)); - build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0)); - build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0)); - build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_w)); - build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_h)); - build_opts.add_option_if(lhs_info.interleave, "-DINTERLEAVE"); - build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(src->dimension(1))); - build_opts.add_option_if(_reinterpret_input_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(src->dimension(2))); - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size())); - build_opts.add_option("-DPARTIAL_LOAD_M0=" + support::cpp11::to_string(partial_load_m0)); - build_opts.add_option("-DPARTIAL_LOAD_K0=" + support::cpp11::to_string(partial_load_k0)); - - std::string kernel_name("gemm_reshape_lhs_matrix_"); - kernel_name += lhs_info.transpose ? 
"t" : "nt"; - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - auto win_config = validate_and_configure_window(src, dst, lhs_info, reinterpret_input_as_3d); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - - // Set config_id for enabling LWS tuning - _config_id = "gemm_reshape_lhs_matrix_"; - _config_id += (_reinterpret_input_as_3d ? "3d_" : ""); - _config_id += lower_string(string_from_data_type(src->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.m0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.k0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.v0); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.interleave); - _config_id += "_"; - _config_id += support::cpp11::to_string(lhs_info.transpose); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClGemmReshapeLhsMatrixKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), lhs_info, reinterpret_input_as_3d).first); - - return Status{}; -} - -void ClGemmReshapeLhsMatrixKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - Window slice = window.first_slice_window_3D(); - - if(_reinterpret_input_as_3d) - { - // Pass bottom paddings to the kernel if the src has to be reinterpreted as 3D tensor - const unsigned int idx0 = 2 * num_arguments_per_3D_tensor(); - const unsigned int total_cross_plane_pad = src->info()->padding().top + src->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h b/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h deleted file mode 100644 index 73d811f3c3..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H -#define ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to reshape the LHS matrix when performing the matrix multiplication. - * In particular, this function splits the src matrix in blocks of size M0xK0 (defined through GEMMLHSInfo) and - * stores each one in the dst matrix unrolling the values - */ -class ClGemmReshapeLhsMatrixKernel : public ICLKernel -{ -public: - ClGemmReshapeLhsMatrixKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmReshapeLhsMatrixKernel); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Input tensor. Data types supported: All - * @param[out] dst Output tensor. Data type supported: same as @p src - * @param[in] lhs_info LHS matrix information to be used for reshaping. This object contains all the necessary - * information to reshape the src tensor. 
Only the following values are supported: - * lhs_info.m0: 2,3,4,5,6,7,8 - * lhs_info.k0: 2,3,4,8,16 - * lhs_info.v0: greater than 0 - * lhs_info.transpose: true, false - * lhs_info.interleave: true, false - * @param[in] reinterpret_src_as_3d (Optional) True if the src has to be reinterpreted as 3D tensor - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_src_as_3d = false); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmReshapeLhsMatrixKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_src_as_3d); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - bool _reinterpret_input_as_3d{ false }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H */ \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp b/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp deleted file mode 100644 index 3a6f3c7e8f..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
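The supported block sizes documented above, and checked by validate_arguments() of the reshape kernels in this patch, reduce to a simple predicate: a power of two no larger than 16, or exactly 3. The sketch below is a hedged condensation of that rule; the helper name is illustrative, and the lower bound of 2 follows the documented set rather than the literal bitwise check in the source.

// Accepts {2, 3, 4, 8, 16}.
bool is_supported_block_size(unsigned int b)
{
    const bool pow2_or_3 = ((b & (b - 1)) == 0) || (b == 3);
    return (b >= 2) && (b <= 16) && pow2_or_3;
}
// is_supported_block_size(8)  -> true
// is_supported_block_size(6)  -> false (neither a power of two nor 3)
// is_supported_block_size(32) -> false (exceeds the 16 limit)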
- */ -#include "src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 == 0); - ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 == 0); - ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.h0 == 0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && (rhs_info.k0 != 1) && (rhs_info.k0 != 3)), "Only 1,2,3,4,8,16 are supported for k0"); - ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16); - ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16); - ARM_COMPUTE_RETURN_ERROR_ON((rhs_info.k0 == 1) && (rhs_info.transpose)); - - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - - if(rhs_info.export_to_cl_image) - { - const TensorInfo tensor_reshaped_info(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info), 1, src->data_type()); - ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info)); - } - - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) -{ - const unsigned int num_elems_processed_per_iteration_x = rhs_info.n0; - const unsigned int num_elems_processed_per_iteration_y = rhs_info.k0; - bool window_changed = false; - - // dst auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info))); - - // Configure window - Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - AccessWindowRectangle src_access(src, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - - window_changed = update_window_and_padding(win, src_access); - - if(rhs_info.export_to_cl_image) - { - gemm::update_padding_for_cl_image(dst); - } - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win.collapse(win, Window::DimZ); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, collapsed); -} -} // namespace - -ClGemmReshapeRhsMatrixKernel::ClGemmReshapeRhsMatrixKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Perform validate step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, rhs_info)); - - // Create build options - CLBuildOptions build_opts; - build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); - build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); - build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); - build_opts.add_option_if(rhs_info.transpose, "-DTRANSPOSE"); - build_opts.add_option_if(rhs_info.interleave, "-DINTERLEAVE"); - build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(1))); - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size())); - - std::string kernel_name("gemm_reshape_rhs_matrix_"); - kernel_name += rhs_info.transpose ? "t" : "nt"; - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - auto win_config = validate_and_configure_window(src, dst, rhs_info); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); -} - -Status ClGemmReshapeRhsMatrixKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, rhs_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), rhs_info).first); - - return Status{}; -} - -void ClGemmReshapeRhsMatrixKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - Window slice = window.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h b/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h deleted file mode 100644 index 27f80d3428..0000000000 --- a/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. 
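As a worked illustration of the configure() step above: with hypothetical reshape parameters rhs_info.n0 = 4, k0 = 4, h0 = 8, transpose = true and interleave = true, the kernel selected would be "gemm_reshape_rhs_matrix_t" and its compile-time options would include the following. The parameter values are invented for the example; only the option names come from the code.

// -DN0=4 -DK0=4 -DH0=8 -DTRANSPOSE -DINTERLEAVE
// plus -DSRC_HEIGHT=<src->dimension(1)> and -DDATA_TYPE=<unsigned type matching the element size>,
// exactly as assembled by the CLBuildOptions calls above.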
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H -#define ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to reshape the RHS matrix when performing the matrix multiplication - * In particular, this kernel splits the src matrix in blocks of size K0xN0 and stores each one in - * the dst matrix unrolling the values */ -class ClGemmReshapeRhsMatrixKernel : public ICLKernel -{ -public: - ClGemmReshapeRhsMatrixKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmReshapeRhsMatrixKernel); - /** Initialise the kernel's input and output. - * - * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor, - * required to create a OpenCL image object from buffer in @ref ClGemmMatrixMultiplyReshapedKernel and in @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel - * Since the OpenCL image object is created importing the OpenCL buffer, the following conditions are required: - * -# rhs_info.n0 can only be 4, 8 and 16 - * -# rhs_info.k0 can only be 4, 8 and 16 - * -# Data type can only be F32, F16 - * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension - * -# output width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4) - * -# output (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT - * -# The output tensor should be only consumed by @ref ClGemmMatrixMultiplyReshapedKernel or @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Input tensor. Data types supported: All - * @param[out] dst Output tensor. Data type supported: same as @p src - * @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary - * information to reshape the src tensor. 
Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true) - * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false), (only 4, 8 and 16 if rhs_info.export_to_cl_image == true) - * rhs_info.h0: greater than 0 - * rhs_info.transpose: true, false - * rhs_info.interleave: true, false - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClGemmReshapeRhsMatrixKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H */ \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp b/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp deleted file mode 100644 index 9ff30eedcd..0000000000 --- a/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
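The export_to_cl_image requirements listed here (and in the matrix-multiply kernels that consume the reshaped RHS) can be read as a checklist. The sketch below is an illustrative condensation of the documented conditions, not the library's gemm::validate_image2d_support_on_rhs() implementation; all names are made up for the example.

// One flag per documented requirement for creating an image2d from the RHS buffer.
struct ClImageExportChecks
{
    bool n0_ok;           // rhs_info.n0 in {4, 8, 16}
    bool k0_ok;           // rhs_info.k0 in {4, 8, 16}
    bool dtype_ok;        // F32 (or F16 where the consuming kernel allows it)
    bool extension_ok;    // device exposes cl_khr_image2d_from_buffer
    bool pitch_ok;        // row pitch satisfies the device's image pitch alignment
    bool width_ok;        // width <= CL_DEVICE_IMAGE2D_MAX_WIDTH * 4
    bool height_depth_ok; // height * depth <= CL_DEVICE_IMAGE2D_MAX_HEIGHT
};

bool can_export_to_cl_image(const ClImageExportChecks &c)
{
    return c.n0_ok && c.k0_ok && c.dtype_ok && c.extension_ok && c.pitch_ok && c.width_ok && c.height_depth_ok;
}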
- */ -#include "src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY)); - - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != dst->dimension(0)); - for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); - } - ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() > 4); - - return Status{}; -} -} // namespace - -ClHeightConcatenateKernel::ClHeightConcatenateKernel() - : _height_offset(0) -{ - _type = CLKernelType::ELEMENTWISE; -} - -Status ClHeightConcatenateKernel::validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, height_offset, dst)); - return Status{}; -} - -void ClHeightConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, height_offset, dst)); - - auto padding_info = get_padding_info({ src, dst }); - - _height_offset = height_offset; - - // Add build options - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0)); - - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size())); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DHEIGHT_OFFSET=" + support::cpp11::to_string(_height_offset)); - build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(src->dimension(2))); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - - if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) - { - const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); - - build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); - build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); - } - - // Create kernel - _kernel = create_kernel(compile_context, "concatenate_height", build_opts.options()); - // Configure kernel window - - // The window needs to be based on src as we copy all the heights of src - Window win = 
calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win.collapse(win, Window::DimZ)); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -void ClHeightConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - unsigned int idx = 0; - add_4D_tensor_argument(idx, src, window); - add_4D_tensor_argument(idx, dst, window); - enqueue(queue, *this, window, lws_hint()); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h b/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h deleted file mode 100644 index 0733078fc2..0000000000 --- a/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_HEIGHT_CONCATENATE_KERNEL_H -#define ARM_COMPUTE_CL_HEIGHT_CONCATENATE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the height concatenate kernel. - * The source tensor will be concatenated into the destination tensor. - */ -class ClHeightConcatenateKernel : public IClKernel -{ -public: - ClHeightConcatenateKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClHeightConcatenateKernel); - /** Initialise the kernel's source and destination - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: All. - * @param[in] height_offset The starting offset on the Y axis for the dst tensor. - * @param[out] dst Destination tensor info. Data types supported: same as @p src. 
- * - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClHeightConcatenateKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; - -private: - unsigned int _height_offset; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_HEIGHT_CONCATENATE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClIm2ColKernel.cpp b/src/core/gpu/cl/kernels/ClIm2ColKernel.cpp deleted file mode 100644 index 61ee443aa5..0000000000 --- a/src/core/gpu/cl/kernels/ClIm2ColKernel.cpp +++ /dev/null @@ -1,431 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
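Putting the height concatenate kernel above in context: a caller concatenating several tensors along the Y axis configures one kernel per source, each with the running sum of the previous heights as height_offset. This is a hypothetical driver sketch, not the library's concatenate operator (which also manages kernel lifetime and execution); srcs, dst and compile_context are assumed to exist in the caller.

unsigned int height_offset = 0;
for(ITensorInfo *src : srcs)
{
    ClHeightConcatenateKernel kernel;
    kernel.configure(compile_context, src, height_offset, dst);
    height_offset += static_cast<unsigned int>(src->dimension(Window::DimY)); // next source starts right below this one
}
// validate_arguments() rejects any source for which
// src->dimension(DimY) + height_offset would exceed dst->dimension(DimY).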
- */ -#include "src/core/gpu/cl/kernels/ClIm2ColKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -#include -#include -#include - -namespace arm_compute -{ -using namespace misc::shape_calculator; -namespace opencl -{ -namespace kernels -{ -namespace -{ -struct Im2ColConfiguration -{ - std::string kernel_name{}; - std::set build_options{}; - unsigned int num_elems_processed_per_iteration{}; - bool is_padding_required_nchw{}; -}; - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, - unsigned int num_groups) -{ - const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); - - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && has_bias); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); - ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1)); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON(num_groups == 0); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::NHWC && num_groups > 1); - ARM_COMPUTE_RETURN_ERROR_ON((src->dimension(channel_idx) % num_groups) != 0); - - // Since there's no implicit padding added, check the total input spatial dimensions (with conv paddings) are big enough for the kernel dimensions - const unsigned int width_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT); - const unsigned total_width = src->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right(); - const unsigned total_height = src->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom(); - ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height)); - - if(dst->total_size() > 0) - { - const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, - unsigned int num_elems_processed_per_iteration, bool is_padding_required_nchw, unsigned int num_groups) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Output tensor auto initialization if not yet initialized - TensorShape 
expected_output_shape = compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups); - - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(expected_output_shape)); - - const DataLayout data_layout = src->data_layout(); - const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const unsigned int input_width = src->dimension(width_idx); - const unsigned int input_height = src->dimension(height_idx); - - // Configure the execute window based on the selected optimal OpenCL kernel - bool window_changed = false; - Window win; - - if(data_layout == DataLayout::NHWC) - { - win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); - } - else - { - if(is_padding_required_nchw) - { - const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left()); - win = calculate_max_window(*src, - Steps(num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second)); - AccessWindowStatic input_access(src, - -border.left, - -border.top, - ceil_to_multiple(input_width + border.right, kernel_dims.width * num_elems_processed_per_iteration), - input_height + border.bottom); - window_changed = window_changed || update_window_and_padding(win, input_access); - } - else - { - // For the generic case, CLIm2ColKernel doesn't need padding (we do not read out-of-bounds elements) so - // update_window_and_padding() can be skipped - win = calculate_max_window(*src, Steps()); - } - } - - // set the Z dimension's step same size as the whole dimension so that one can't split across the Z dimension - win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start()); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} - -Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups) -{ - const DataLayout data_layout = src->data_layout(); - const DataType data_type = src->data_type(); - const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - const unsigned int input_width = src->dimension(width_idx); - const unsigned int input_height = src->dimension(height_idx); - const unsigned int input_channel = src->dimension(channel_idx); - - const std::pair convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation); - - // Im2Col configuration - std::string kernel_name = "im2col_generic_"; - CLBuildOptions build_opts; - unsigned int num_elems_processed_per_iteration = 1; - bool is_padding_required_nchw = false; - const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); - - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); - build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src->element_size())); - build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width)); - build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height)); - build_opts.add_option("-DCONVOLVED_WIDTH=" + support::cpp11::to_string(convolved_dims.first)); - build_opts.add_option("-DCONVOLVED_HEIGHT=" + support::cpp11::to_string(convolved_dims.second)); - build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first)); - build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second)); - build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left())); - build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top())); - build_opts.add_option("-DPAD_RIGHT=" + support::cpp11::to_string(conv_info.pad_right())); - build_opts.add_option("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom())); - build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width)); - build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height)); - build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_channel)); - build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x())); - build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y())); - build_opts.add_option_if(num_groups > 1, "-DNUM_GROUPS=" + support::cpp11::to_string(num_groups)); - build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(qinfo.offset), "-DPAD_VALUE=0"); - build_opts.add_option_if(has_bias, "-DHAS_BIAS"); - - if(data_layout == DataLayout::NHWC) - { - num_elems_processed_per_iteration = std::min(2U, input_channel); - is_padding_required_nchw = false; - - // Only the 3x3 and 9x9 cases are optimized for NHWC - if(kernel_dims == Size2D(3U, 3U)) - { - kernel_name = "im2col3x3_"; - } - else if(kernel_dims == Size2D(9U, 9U)) - { - kernel_name = "im2col9x9_"; - } - - // Get 
boundary vector (the first/last vector with potentially a partial vector size) size - // If input_channel is a multiple of num_elems_processed_per_iteration, the boundary vec size is the (full) vector size - // otherwise, the boundary vec size is the (partial) remainder vector size - const unsigned int vec_size = num_elems_processed_per_iteration; - const unsigned int partial_vec_size = input_channel % vec_size; - const unsigned int boundary_vec_size = vec_size - ((vec_size - partial_vec_size) % vec_size); - build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vec_size)); - build_opts.add_option("-DBOUNDARY_VECTOR_SIZE=" + support::cpp11::to_string(boundary_vec_size)); - } - else - { - if(dilation == Size2D(1U, 1U)) - { - const bool squared_im2col = kernel_dims.width == kernel_dims.height; - if(squared_im2col) - { - // Check if we can run an optimized im2col for NCHW - switch(kernel_dims.width) - { - case 1: - // Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false - if(conv_info.stride().first == 1 && !conv_info.has_padding()) - { - kernel_name = "im2col1x1_stridex1_"; - num_elems_processed_per_iteration = 4; - is_padding_required_nchw = true; - } - break; - case 3: - kernel_name = "im2col3x3_"; - num_elems_processed_per_iteration = 1; - is_padding_required_nchw = true; - break; - case 5: - kernel_name = "im2col5x5_"; - num_elems_processed_per_iteration = 1; - is_padding_required_nchw = true; - break; - case 11: - // Optimized im2col11x11 if pad_x = pad_y = 0 - if(!conv_info.has_padding()) - { - kernel_name = "im2col11x11_padx0_pady0_"; - num_elems_processed_per_iteration = 1; - is_padding_required_nchw = true; - } - break; - default: - kernel_name = "im2col_generic_"; - num_elems_processed_per_iteration = 1; - is_padding_required_nchw = false; - break; - } - } - else if(kernel_dims.width > 1 && !conv_info.has_padding()) - { - kernel_name = "im2col_generic_padx0_pady0_"; - num_elems_processed_per_iteration = 1; - is_padding_required_nchw = false; - - // Optimized im2col is performed using one or more vector operations with the specified vector size - // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4 - // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3. - // Using the vector size of 4 is always safe since OpenCL supports vectors of size 2 and 3. - // Using the vector size of 8, however, may be faster. - // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0 - // is used instead.) 
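Worked values for the two statements that follow, using the kernel widths mentioned in the comment above (illustration only):

// kernel_dims.width = 5 -> vector_size = 4, width_mod_vector_size = 1   (one vec4 + one scalar per kernel row)
// kernel_dims.width = 7 -> vector_size = 4, width_mod_vector_size = 3   (one vec4 + one vec3 per kernel row)
// kernel_dims.width = 2 -> vector_size = 2, width_mod_vector_size = 0   (a single vec2 per kernel row)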
- const size_t vector_size = std::min(static_cast(4), kernel_dims.width); - const size_t width_mod_vector_size = kernel_dims.width % vector_size; - build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size)); - build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size)); - } - } - } - - // Append the data layout to the kernel_name - kernel_name += lower_string(string_from_data_layout(data_layout)); - - Im2ColConfiguration im2col_config; - im2col_config.kernel_name = kernel_name; - im2col_config.build_options = build_opts.options(); - im2col_config.num_elems_processed_per_iteration = num_elems_processed_per_iteration; - im2col_config.is_padding_required_nchw = is_padding_required_nchw; - - return im2col_config; -} -} // namespace - -ClIm2ColKernel::ClIm2ColKernel() - : _data_layout(DataLayout::UNKNOWN), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClIm2ColKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, - const Size2D &dilation, - unsigned int num_groups) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups)); - - auto padding_info = get_padding_info({ src, dst }); - _data_layout = src->data_layout(); - - const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const unsigned int input_width = src->dimension(width_idx); - const unsigned int input_height = src->dimension(height_idx); - - // Select and configure the optimal OpenCL kernel to run. 
- // This function returns the OpenCL kernel's name, the arguments to pass at compile time, the number of elements processed per iteration - // and the padding requirement flag - Im2ColConfiguration im2col_config = configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups); - - // Create kernel - _kernel = create_kernel(compile_context, im2col_config.kernel_name, im2col_config.build_options); - - _convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation); - _num_elems_processed_per_iteration = im2col_config.num_elems_processed_per_iteration; - _kernel_dims = kernel_dims; // Only needed by the Tuner - _conv_info = conv_info; // Only needed by the Tuner - _num_groups = num_groups; - - // Configure kernel window - auto win_config = validate_and_configure_window(src, dst, kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration, - im2col_config.is_padding_required_nchw, num_groups); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - IClKernel::configure_internal(win_config.second); - - // Set config_id for enabling LWS tuning - _config_id = im2col_config.kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(src->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(num_groups); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(1)); - _config_id += "_"; - _config_id += lower_string(string_from_data_layout(_data_layout)); - - ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info)); -} - -Status ClIm2ColKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, - unsigned int num_groups) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups)); - Im2ColConfiguration im2col_config = configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration, - im2col_config.is_padding_required_nchw, num_groups) - .first); - return Status{}; -} - -void ClIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IClKernel::window(), window); - ARM_COMPUTE_ERROR_ON(tensors.empty()); - - // Get initial windows - // Collapse in order to have (SRC_DEPTH * BATCH_SIZE) on the 3rd dimension - Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - window_collapsed.set_dimension_step(Window::DimZ, 1); - - auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - Window window_output; - window_output.use_tensor_dimensions(dst->info()->tensor_shape()); - - const Window first_slice_3d = window_collapsed.first_slice_window_3D(); - - Window slice = first_slice_3d; - Window slice_in = first_slice_3d; - Window slice_out = window_output.first_slice_window_2D(); - - if(_data_layout == DataLayout::NHWC) 
- { - const Window tmp_win = window.collapse_if_possible(ICLKernel::window(), 3); - const int num_batches = tmp_win[3].end(); - - slice.set(1, Window::Dimension(0, static_cast(dst->info()->tensor_shape()[1]), 1)); - slice.set(2, Window::Dimension(0, static_cast(num_batches), 1)); - } - else - { - slice.set(0, Window::Dimension(0, static_cast(ceil_to_multiple(_convolved_dims.first, _num_elems_processed_per_iteration)), _num_elems_processed_per_iteration)); - slice.set(1, Window::Dimension(0, static_cast(_convolved_dims.second), 1)); - // Note: In case of NCHW the 3rd dimension is already set collapsing the input window - } - - // Setup input slice - // The dimensions of the input are increased within the OpenCL kernel - slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - // Setup output slice - // The dimensions of the output are increased within the OpenCL kernel - slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - - unsigned int idx = num_arguments_per_3D_tensor() + (_num_groups == 1 ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor()); - _kernel.setArg(idx++, static_cast(src->info()->strides_in_bytes()[3])); - _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[((_num_groups == 1) ? 2 : 3)])); - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice_in); - if(_num_groups == 1) - { - add_2D_tensor_argument(idx, dst, slice_out); - } - else - { - add_3D_tensor_argument(idx, dst, slice_out); - } - enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) && window_collapsed.slide_window_slice_3D(slice_in)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClIm2ColKernel.h b/src/core/gpu/cl/kernels/ClIm2ColKernel.h deleted file mode 100644 index d1443f0434..0000000000 --- a/src/core/gpu/cl/kernels/ClIm2ColKernel.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
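To make the transform concrete, here is a minimal scalar reference of the rearrangement this kernel family performs, restricted to a single-channel NCHW image with stride 1, no padding and no dilation. It is an illustration only, independent of the OpenCL kernels and of the Compute Library API, and it reproduces the 4x4 / 3x3 example given in the class comment of the header that follows.

#include <vector>

// k x k blocks of a w x h image become the rows of an (out_w * out_h) x (k * k) matrix.
std::vector<float> im2col_reference(const std::vector<float> &src, int w, int h, int k)
{
    const int out_w = w - k + 1;
    const int out_h = h - k + 1;
    std::vector<float> dst(out_w * out_h * k * k);
    int row = 0;
    for(int y = 0; y < out_h; ++y)
    {
        for(int x = 0; x < out_w; ++x, ++row)
        {
            int col = 0;
            for(int ky = 0; ky < k; ++ky)
            {
                for(int kx = 0; kx < k; ++kx, ++col)
                {
                    dst[row * k * k + col] = src[(y + ky) * w + (x + kx)];
                }
            }
        }
    }
    return dst; // a 4x4 input with k = 3 yields the 4x9 matrix shown in the header's class comment
}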
- */ -#ifndef ARM_COMPUTE_CL_IM2COL_KERNEL_H -#define ARM_COMPUTE_CL_IM2COL_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/Size2D.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the im2col reshape kernel. - * - * Rearranges image blocks into columns. It is used to strip out each convolution block to a single column. - * It is used to transform a convolution to a plain matrix multiplication. - * - * For example taking into account the image below and assuming 3x3 image blocks with stride of 1 we have: - * @f[ - * \left( \begin{array}{cccc} - * a00 & a01 & a02 & a03 \\ - * a10 & a11 & a12 & a13 \\ - * a20 & a21 & a22 & a23 \\ - * a30 & a31 & a32 & a33 \\ - * \end{array} \right) - * = - * \left( \begin{array}{ccccccccc} - * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\ - * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\ - * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\ - * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\ - * \end{array} \right) - * @f] - */ -class ClIm2ColKernel : public IClKernel -{ -public: - /** Default constructor */ - ClIm2ColKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClIm2ColKernel); - /** Set the input and output of the kernel. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src The input tensor info to convert. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - * @param[out] dst The output tensor info. First 2 lower dimensions represent a transform of each 3D input, - * while every dimension above represents a batch. Data types supported: Same as @p input - * @param[in] kernel_dims The kernel dimensions (width and height). - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] has_bias In case biases are provided expands the matrix with 1. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is only supported for NCHW data layout - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, - const Size2D &dilation = Size2D(1U, 1U), - unsigned int num_groups = 1); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to ClIm2ColKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U), - unsigned int num_groups = 1); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -public: - DataLayout _data_layout; - std::pair _convolved_dims; - unsigned int _num_elems_processed_per_iteration; - Size2D _kernel_dims; - PadStrideInfo _conv_info; - unsigned int _num_groups; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_IM2COL_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClMulKernel.cpp b/src/core/gpu/cl/kernels/ClMulKernel.cpp deleted file mode 100644 index 7c4dddc20e..0000000000 --- a/src/core/gpu/cl/kernels/ClMulKernel.cpp +++ /dev/null @@ -1,439 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
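For context on the transform that the ClIm2ColKernel documentation above illustrates with the 4x4 / 3x3 example, here is a minimal standalone sketch in plain C++ (no ComputeLibrary types; the function name, row-major layout, unit stride and the absence of padding, dilation and bias are illustrative assumptions) of how a single-channel image is unrolled into patch rows:

#include <cstddef>
#include <cstdio>
#include <vector>

// Illustrative im2col for one channel, stride 1, no padding, no dilation.
// Each output row holds one kernel_h x kernel_w patch laid out left-to-right,
// top-to-bottom, matching the 4x4 / 3x3 example in the class documentation.
std::vector<std::vector<float>> im2col_sketch(const std::vector<float> &img,
                                              size_t height, size_t width,
                                              size_t kernel_h, size_t kernel_w)
{
    std::vector<std::vector<float>> rows;
    for(size_t y = 0; y + kernel_h <= height; ++y)
    {
        for(size_t x = 0; x + kernel_w <= width; ++x)
        {
            std::vector<float> patch;
            for(size_t ky = 0; ky < kernel_h; ++ky)
            {
                for(size_t kx = 0; kx < kernel_w; ++kx)
                {
                    patch.push_back(img[(y + ky) * width + (x + kx)]);
                }
            }
            rows.push_back(patch);
        }
    }
    return rows;
}

int main()
{
    // The 4x4 case from the class documentation, filled with 0..15.
    std::vector<float> img(16);
    for(size_t i = 0; i < img.size(); ++i)
    {
        img[i] = static_cast<float>(i);
    }
    const auto rows = im2col_sketch(img, 4, 4, 3, 3);
    std::printf("%zu rows of %zu elements\n", rows.size(), rows.front().size()); // 4 rows of 9
    return 0;
}

On the 4x4 image from the class comment this yields the four 9-element rows shown in the formula above; the actual kernel additionally handles NCHW/NHWC layouts, padding, dilation, grouping and, when has_bias is set, the extra column of ones.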
- */ -#include "src/core/gpu/cl/kernels/ClMulKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/TensorInfo.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_UNUSED(overflow_policy); - ARM_COMPUTE_UNUSED(rounding_policy); - - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, - 1, - DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, DataType::S32, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, - 1, - DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, DataType::S32, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative."); - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type())); - - // Check whether it is in_place calculation - const bool in_place = (src1 == dst) || (src2 == dst); - const bool src1_in_place = in_place && (src1 == dst); - - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - - // Validate in case of configured dst - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, - 1, - DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, - DataType::S32, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::U8 && (src1->data_type() != DataType::U8 || src2->data_type() != DataType::U8), - "Dst can only be U8 if both src are U8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QASYMM8 && (src1->data_type() != DataType::QASYMM8 || src2->data_type() != DataType::QASYMM8), - "Dst can only be QASYMM8 if both src are QASYMM8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QASYMM8_SIGNED && (src1->data_type() != DataType::QASYMM8_SIGNED || src2->data_type() != DataType::QASYMM8_SIGNED), - "Dst can only be QASYMM8_SIGNED if both src are QASYMM8_SIGNED"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QSYMM16 && (src1->data_type() != DataType::QSYMM16 || src2->data_type() != DataType::QSYMM16), - "Dst can only be QSYMM16 if both src are QSYMM16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src1->data_type() == DataType::S32 || src2->data_type() == DataType::S32) && (dst->data_type() != DataType::S32), - "Dst must be S32 if source tensors are S32"); - if(in_place) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src1_in_place ? 
src1->tensor_shape() : src2->tensor_shape(), 0), - "Wrong shape for dst, cannot do in_place calculation"); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), - "Wrong shape for dst"); - } - } - - return Status{}; -} -} // namespace - -ClMulKernel::ClMulKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, - scale, overflow_policy, rounding_policy, act_info)); - - auto padding_info = get_padding_info({ src1, src2, dst }); - - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape)); - - int scale_int = -1; - // Extract sign, exponent and mantissa - int exponent = 0; - float normalized_mantissa = std::frexp(scale, &exponent); - // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15 - // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14 - // Moreover, it will be negative as we deal with 1/2^n - if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)) - { - // Store the positive exponent. We know that we compute 1/2^n - // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5 - scale_int = std::abs(exponent - 1); - } - - std::string acc_type; - // Check if it has float src and dst - if(is_data_type_float(src1->data_type()) || is_data_type_float(src2->data_type())) - { - scale_int = -1; - acc_type = (src1->data_type() == DataType::F32 || src2->data_type() == DataType::F32) ? "float" : "half"; - } - else - { - if(src1->element_size() == 4 || src2->element_size() == 4) - { - // use 64 bit accumulator for 32-bit input - acc_type = "long"; - } - else if(src1->element_size() == 2 || src2->element_size() == 2) - { - // Use 32-bit accumulator for 16-bit input - acc_type = "int"; - } - else - { - // Use 16-bit accumulator for 8-bit input - acc_type = "ushort"; - } - } - - const bool is_quantized = is_data_type_quantized(src1->data_type()); - const unsigned int vec_size = adjust_vec_size(16 / dst->element_size(), dst->dimension(0)); - const unsigned int vec_size_leftover = dst->dimension(0) % vec_size; - - // Set kernel build options - std::string kernel_name = "pixelwise_mul"; - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(src1->data_type())); - build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(src2->data_type())); - build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option("-DVEC_SIZE_IN1=" + ((dst->dimension(0) != 1 && src1->dimension(0) == 1) ? "1" : support::cpp11::to_string(vec_size))); - build_opts.add_option("-DVEC_SIZE_IN2=" + ((dst->dimension(0) != 1 && src2->dimension(0) == 1) ? 
"1" : support::cpp11::to_string(vec_size))); - build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(vec_size)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); - if(is_quantized && (dst->data_type() != DataType::S32)) - { - const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); - - build_opts.add_option_if(is_data_type_quantized_asymmetric(src1->data_type()), - "-DOFFSET_IN1=" + support::cpp11::to_string(iq1_info.offset)); - build_opts.add_option_if(is_data_type_quantized_asymmetric(src2->data_type()), - "-DOFFSET_IN2=" + support::cpp11::to_string(iq2_info.offset)); - build_opts.add_option_if(is_data_type_quantized_asymmetric(dst->data_type()), - "-DOFFSET_OUT=" + support::cpp11::to_string(oq_info.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale)); - build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); - kernel_name += "_quantized"; - } - else - { - kernel_name += (scale_int >= 0) ? "_int" : "_float"; - build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(dst->data_type()), "-DWRAP", "-DSATURATE"); - build_opts.add_option_if_else(rounding_policy == RoundingPolicy::TO_ZERO, "-DROUND=_rtz", "-DROUND=_rte"); - build_opts.add_option("-DACC_DATA_TYPE=" + acc_type); - if(act_info.enabled()) - { - build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); - build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); - build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); - } - } - - // Check whether it is in_place calculation - const bool in_place = (src1 == dst) || (src2 == dst); - const bool src1_in_place = in_place && (src1 == dst); - build_opts.add_option_if(in_place, "-DIN_PLACE"); - build_opts.add_option_if(src1_in_place, "-DSRC1_IN_PLACE"); - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set scale argument - unsigned int idx = (in_place ? 
2 : 3) * num_arguments_per_3D_tensor(); // Skip the src and dst parameters - - if(scale_int >= 0 && !is_quantized) - { - _kernel.setArg(idx++, scale_int); - } - else - { - _kernel.setArg(idx++, scale); - } - - Window win = calculate_max_window(*dst, Steps(vec_size)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(dst->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(src1->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src1->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src1->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src2->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src2->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src2->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(2)); -} - -Status ClMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info)); - - return Status{}; -} - -void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src_0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src_1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst); - - const TensorShape &in_shape1 = src_0->info()->tensor_shape(); - const TensorShape &in_shape2 = src_1->info()->tensor_shape(); - const TensorShape &out_shape = dst->info()->tensor_shape(); - - bool can_collapse = true; - if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) - { - can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) - { - can_collapse = (in_shape1[d] == in_shape2[d]); - } - } - - bool has_collapsed = false; - Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; - - const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; - const TensorShape &in_shape2_collapsed = has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; - - Window slice = collapsed.first_slice_window_3D(); - Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); - Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); - - // Check whether it is in_place calculation - const bool in_place = (src_0 == dst) || (src_1 == dst); - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src_0, slice_input1); - add_3D_tensor_argument(idx, src_1, slice_input2); - if(!in_place) - { - add_3D_tensor_argument(idx, dst, slice); - } - enqueue(queue, *this, slice, lws_hint()); - - ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1)); - ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2)); - } - while(collapsed.slide_window_slice_3D(slice)); -} - -namespace -{ -constexpr unsigned int vec_size_complex = 1; - -Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); - - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type())); - - // Validate in case of configured dst - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); - } - - return Status{}; -} -} // namespace - -ClComplexMulKernel::ClComplexMulKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClComplexMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst, act_info)); - - auto padding_info = get_padding_info({ src1, src2, dst }); - - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape)); - - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - if(act_info.enabled()) - { - build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); - build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); - build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); - } - - // Create kernel - _kernel = create_kernel(compile_context, "pixelwise_mul_complex", build_opts.options()); - - Window win = calculate_max_window(*dst, Steps(vec_size_complex)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - 
ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst, act_info)); - - return Status{}; -} - -void ClComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src_0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src_1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - const TensorShape &in_shape1 = src_0->info()->tensor_shape(); - const TensorShape &in_shape2 = src_1->info()->tensor_shape(); - const TensorShape &out_shape = dst->info()->tensor_shape(); - - bool can_collapse = true; - if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) - { - can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) - { - can_collapse = (in_shape1[d] == in_shape2[d]); - } - } - - bool has_collapsed = false; - Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; - - const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; - const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; - - Window slice = collapsed.first_slice_window_3D(); - Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); - Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src_0, slice_input1); - add_3D_tensor_argument(idx, src_1, slice_input2); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - - ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1)); - ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2)); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClMulKernel.h b/src/core/gpu/cl/kernels/ClMulKernel.h deleted file mode 100644 index 2ee182b932..0000000000 --- a/src/core/gpu/cl/kernels/ClMulKernel.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_MUL_KERNEL_H -#define ARM_COMPUTE_CL_MUL_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the pixelwise multiplication kernel. - * - * For binary elementwise ops in-place cannot be enabled by passing nullptr to dst, it can only be enabled by passing either src1 or src2 to dst instead. - * -*/ -class ClMulKernel : public IClKernel -{ -public: - ClMulKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMulKernel); - /** Initialise the kernel's src and dst. - * - * Valid configurations (Input1,Input2) -> Output : - * - * - (U8,U8) -> U8 - * - (U8,U8) -> S16 - * - (U8,S16) -> S16 - * - (S16,U8) -> S16 - * - (S16,S16) -> S16 - * - (S32,S32) -> S32 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - (QASYMM8,QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (QSYMM16,QSYMM16) -> QSYMM16 - * - (QSYMM16,QSYMM16) -> S32 - * - * @param[in] compile_context The compile context to be used. - * @param[in] src1 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32 - * @param[in] src2 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32 - * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32 - * @param[in] scale Scale to apply after multiplication. - * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. - * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate - * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClMulKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; - -/** Interface for the complex pixelwise multiplication kernel. */ -class ClComplexMulKernel : public ICLKernel -{ -public: - ClComplexMulKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClComplexMulKernel); - /** Initialise the kernel's src and dst. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src1 An src tensor info. Data types supported: F32. Number of channels supported: 2. - * @param[in] src2 An src tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1. 
- * @param[out] dst The dst tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClComplexMulKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_MUL_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClPermuteKernel.cpp b/src/core/gpu/cl/kernels/ClPermuteKernel.cpp deleted file mode 100644 index 722bf454f2..0000000000 --- a/src/core/gpu/cl/kernels/ClPermuteKernel.cpp +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
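The scale handling in ClMulKernel::configure() above selects the integer kernel variant only when the scale is exactly 1/2^n with 0 <= n <= 15, which matches the constraint documented for the configure() scale parameter. A small standalone reproduction of that frexp-based test (plain C++; the function name is illustrative and not part of the library):

#include <cmath>
#include <cstdio>

// Returns n >= 0 when scale == 1/2^n for 0 <= n <= 15, otherwise -1.
// Mirrors the test in ClMulKernel::configure(): frexp(1/2^n) yields a
// mantissa of exactly 0.5 and an exponent of 1 - n, so abs(exponent - 1) == n.
int scale_as_right_shift(float scale)
{
    int   exponent = 0;
    float mantissa = std::frexp(scale, &exponent);
    if(mantissa == 0.5f && exponent >= -14 && exponent <= 1)
    {
        return std::abs(exponent - 1);
    }
    return -1;
}

int main()
{
    std::printf("%d %d %d\n",
                scale_as_right_shift(1.0f),         // 0  -> "_int" kernel suffix
                scale_as_right_shift(0.0078125f),   // 7  (scale == 1/128)
                scale_as_right_shift(1.0f / 255));  // -1 -> "_float" kernel suffix
    return 0;
}

Note that 1/255, although an accepted scale value, does not pass the power-of-two test and therefore takes the float path, consistent with the "_int"/"_float" kernel-name selection in the configure() code above.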
- */ -#include "src/core/gpu/cl/kernels/ClPermuteKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -TensorShape get_dst_shape(const ITensorInfo *src, const PermutationVector &perm) -{ - TensorShape dst_shape = src->tensor_shape(); - permute(dst_shape, perm); - return dst_shape; -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() < 1 || src->num_dimensions() > 4, - "Permutation up to 4-D src tensor is supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(perm.num_dimensions() < 1 || perm.num_dimensions() > 4, - "Permutation vector size should be less than or equal to 4"); - for(const auto &p : perm) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(p >= perm.num_dimensions(), "Permutation vector has invalid values"); - } - - // Validate configured dst - if(dst->total_size() != 0) - { - const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - return Status{}; -} -} // namespace - -ClPermuteKernel::ClPermuteKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClPermuteKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - auto padding_info = get_padding_info({ src, dst }); - const TensorShape dst_shape = get_dst_shape(src, perm); - // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, perm)); - - _perm = perm; - - // Create kernel - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(src->data_type()))); - build_opts.add_option("-DDEPTH_IN=" + support::cpp11::to_string(src->dimension(2))); - // New positions of width(W), height(H), channel(C) and batch(D) based on permutation vector - build_opts.add_option("-DP1=" + support::cpp11::to_string((_perm.num_dimensions() >= 1) ? perm[0] : 0)); - build_opts.add_option("-DP2=" + support::cpp11::to_string((_perm.num_dimensions() >= 2) ? perm[1] : 1)); - build_opts.add_option("-DP3=" + support::cpp11::to_string((_perm.num_dimensions() >= 3) ? perm[2] : 2)); - build_opts.add_option("-DP4=" + support::cpp11::to_string((_perm.num_dimensions() >= 4) ? 
perm[3] : 3)); - - _kernel = create_kernel(compile_context, "permute", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - - ICLKernel::configure_internal(win); - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClPermuteKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, perm)); - - return Status{}; -} - -void ClPermuteKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); - - // Setup dst slice - Window slice_out(slice_in); - slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); - slice_out.set(3, Window::Dimension(0, 0, 0)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, src, slice_in); - add_4D_tensor_argument(idx, dst, slice_out); - enqueue(queue, *this, slice_in, lws_hint()); - } - while(window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClPermuteKernel.h b/src/core/gpu/cl/kernels/ClPermuteKernel.h deleted file mode 100644 index 839e224ee4..0000000000 --- a/src/core/gpu/cl/kernels/ClPermuteKernel.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_PERMUTE_KERNEL_H -#define ARM_COMPUTE_CL_PERMUTE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to perform tensor permutation. 
- * - * Permutes given a permutation vector - */ -class ClPermuteKernel : public IClKernel -{ -public: - ClPermuteKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPermuteKernel); - /** Set the src and dst of the kernel. - * - * @note Arbitrary permutation vectors are supported with rank not greater than 4 - * - * @param[in] compile_context The compile context to be used. - * @param[in] src The src tensor info. Data types supported: All. - * @param[in] dst The dst tensor info. Data types supported: Same as @p src - * @param[in] perm Permutation vector - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClPermuteKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - PermutationVector _perm{}; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_PERMUTE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClPool2dKernel.cpp b/src/core/gpu/cl/kernels/ClPool2dKernel.cpp deleted file mode 100644 index e522814b6d..0000000000 --- a/src/core/gpu/cl/kernels/ClPool2dKernel.cpp +++ /dev/null @@ -1,509 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
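The ClPermuteKernel deleted above is driven by a PermutationVector of rank up to 4; below is a minimal sketch of how such a vector rearranges a shape, assuming the convention used by get_dst_shape() in the kernel source, namely that source dimension i moves to destination position perm[i]. The convention, the function name and the fixed rank of 4 are assumptions for illustration only, not library API:

#include <array>
#include <cstddef>
#include <cstdio>

// Illustrative shape permutation: destination position perm[i] receives
// source dimension i (assumed convention, see lead-in above).
std::array<size_t, 4> permute_shape(const std::array<size_t, 4> &src,
                                    const std::array<size_t, 4> &perm)
{
    std::array<size_t, 4> dst{};
    for(size_t i = 0; i < 4; ++i)
    {
        dst[perm[i]] = src[i];
    }
    return dst;
}

int main()
{
    // A [W, H, C, N] = [8, 4, 3, 2] shape permuted with (1, 2, 0, 3):
    // W -> position 1, H -> position 2, C -> position 0, N unchanged.
    const auto out = permute_shape({8, 4, 3, 2}, {1, 2, 0, 3});
    std::printf("%zu %zu %zu %zu\n", out[0], out[1], out[2], out[3]); // 3 8 4 2
    return 0;
}

With that convention the example prints 3 8 4 2, i.e. the channel dimension is moved ahead of width and height, the kind of layout reshuffle this kernel is used for.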
- */ -#include "src/core/gpu/cl/kernels/ClPool2dKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -using namespace arm_compute::misc::shape_calculator; - -namespace -{ -// Internal window config info -using ClPoolingConfig = std::pair; //num_elems_processed_per_iteration, border_size - -void auto_init(const ITensorInfo *src, ITensorInfo *dst, ITensorInfo *indices, PoolingLayerInfo pool_info) -{ - TensorShape out_shape = compute_pool_shape(*src, pool_info); - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(out_shape)); - if(indices) - { - auto_init_if_empty(*indices, src->clone()->set_tensor_shape(out_shape).set_data_type(DataType::U32)); - } -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(src->data_type()) && pool_info.pool_type == PoolingType::L2), - "Unsupported combination of parameters!"); - - const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const bool is_global_pooling = pool_info.is_global_pooling; - unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; - unsigned int pool_size_y = is_global_pooling ? 
src->dimension(idx_height) : pool_info.pool_size.height; - int output_width = 0; - int output_height = 0; - std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], - pool_size_x, pool_size_y, pool_info.pad_stride_info); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid"); - - // Check indices - if(indices) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); - - if(indices->total_size() != 0) - { - TensorInfo idx_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, DataType::U32)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &idx_info); - } - } - - // Checks performed when dst is configured - if(dst->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); - TensorInfo out_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, dst->data_type())); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info); - } - - return Status{}; -} - -std::tuple validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Get data layout - const DataLayout data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - int pool_stride_x = 0; - int pool_stride_y = 0; - unsigned int pooled_w = 0; - unsigned int pooled_h = 0; - int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; - int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; - std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - const int pool_pad_right = pad_stride_info.pad_right(); - const int pool_pad_top = pad_stride_info.pad_top(); - const int pool_pad_left = pad_stride_info.pad_left(); - const int pool_pad_bottom = pad_stride_info.pad_bottom(); - BorderSize border_size = BorderSize(); - - auto_init(src, dst, indices, pool_info); - pooled_w = dst->tensor_shape()[idx_width]; - pooled_h = dst->tensor_shape()[idx_height]; - - const DataType data_type = src->data_type(); - - const int src_width = src->dimension(idx_width); - const int src_height = src->dimension(idx_height); - - unsigned int num_elems_processed_per_iteration = 0; - bool window_changed = false; - Window win{}; - switch(data_layout) - { - case DataLayout::NCHW: - { - // Initialize border size - border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left); - // Change the number of elements processed per iteration - // for pooling 3x3 with stride less equal than 3 - const bool can_optimize = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_quantized(data_type); - num_elems_processed_per_iteration = can_optimize ? 
4 : 1; - const unsigned int num_elems_read_per_iteration = (num_elems_processed_per_iteration - 1) * pool_stride_x + pool_size_x; - - // Number of iterations in X dimension - const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration; - - // Upper limit for the number of right/bottom border elements that are accessed - const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - src_width; - const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - src_height; - - border_size.right = std::max(upper_bound_w, pool_pad_right); - border_size.bottom = std::max(upper_bound_h, pool_pad_bottom); - - win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); - - AccessWindowRectangle src_access(src, -pool_pad_left, -pool_pad_top, num_elems_read_per_iteration, pool_size_y, - pool_stride_x, pool_stride_y); - AccessWindowHorizontal dst_access(dst, 0, num_elems_processed_per_iteration); - - // Update indices window - if(indices) - { - AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration); - window_changed = update_window_and_padding(win, src_access, dst_access, indices_access); - indices_access.set_valid_region(win, ValidRegion(Coordinates(), indices->tensor_shape())); - } - else - { - window_changed = update_window_and_padding(win, src_access, dst_access); - } - - dst_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); - break; - } - case DataLayout::NHWC: - { - const size_t vec_size = dst->data_type() == DataType::F32 ? 2 : 4; - - // Initialize border size - border_size = BorderSize(); - num_elems_processed_per_iteration = adjust_vec_size(vec_size, dst->dimension(0)); - win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); - break; - } - default: - ARM_COMPUTE_ERROR("Not implemented"); - } - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_tuple(err, win, ClPoolingConfig(num_elems_processed_per_iteration, border_size)); -} -} // namespace - -ClPool2dKernel::ClPool2dKernel() -{ - _type = CLKernelType::POOL; -} - -BorderSize ClPool2dKernel::border_size() const -{ - return _border_size; -} - -void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - auto padding_info = get_padding_info({ src, dst, indices }); - - // Set instance variables - _pool_info = pool_info; - _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - int pool_stride_x = 0; - int pool_stride_y = 0; - const PoolingType pool_type = pool_info.pool_type; - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES); - const int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? 
src->dimension(idx_height) : pool_info.pool_size.height; - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; - const bool exclude_padding = pool_info.exclude_padding; - std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - const int pool_pad_top = pad_stride_info.pad_top(); - const int pool_pad_left = pad_stride_info.pad_left(); - - // Set build options - CLBuildOptions build_opts; - const DataType data_type = src->data_type(); - - // Configure kernel window - auto win_config = validate_and_configure_window(src, dst, pool_info, indices); - - ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); - ICLKernel::configure_internal(std::get<1>(win_config)); - - ClPoolingConfig pooling_config = std::get<2>(win_config); - _num_elems_processed_per_iteration = pooling_config.first; - _border_size = pooling_config.second; - - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration)); - - // Tensor paddings are used to calculate the indicies for MAX pooling - if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type)) - { - build_opts.add_option("-DPAD_TENSOR_LEFT=" + support::cpp11::to_string(src->padding().left)); - build_opts.add_option("-DPAD_TENSOR_RIGHT=" + support::cpp11::to_string(src->padding().right)); - build_opts.add_option("-DPAD_TENSOR_TOP=" + support::cpp11::to_string(src->padding().top)); - build_opts.add_option("-DPAD_TENSOR_BOTTOM=" + support::cpp11::to_string(src->padding().bottom)); - build_opts.add_option("-DTENSOR_CHANNEL=" + support::cpp11::to_string(src->dimension(idx_channel))); - build_opts.add_option("-DTENSOR_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width))); - build_opts.add_option("-DTENSOR_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height))); - } - - if(is_data_type_quantized_asymmetric(data_type) && src->quantization_info() != dst->quantization_info()) - { - const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); - - build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); - build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); - } - - // Check dst dimensions - auto_init(src, dst, indices, pool_info); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices)); - - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); - build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type)); - build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x)); - build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y)); - build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_left)); - build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_top)); - build_opts.add_option("-DPOOL_SIZE_X=" + support::cpp11::to_string(pool_size_x)); - build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y)); - - // Set the initial value for the pooling operation accordingly with the data type - if(pool_type == PoolingType::MAX) - { - if(is_data_type_quantized(data_type)) - { - PixelValue type_min{}; - std::tie(type_min, std::ignore) = 
get_min_max(data_type); - build_opts.add_option("-DINITIAL_VALUE=" + support::cpp11::to_string(type_min.get())); - } - else - { - build_opts.add_option("-DINITIAL_VALUE=" + float_to_string_with_full_precision(std::numeric_limits::lowest())); - } - } - else - { - // Pool AVG and Pool L2 initial value - build_opts.add_option("-DINITIAL_VALUE=0"); - } - - build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left))); - build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top))); - - // Create kernel - switch(_data_layout) - { - case DataLayout::NCHW: - { - const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision; - const auto use_wider_accumulator = use_fp_mixed_precision && (pool_type != PoolingType::MAX); - const auto acc_data_type = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : data_type); - build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type); - build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION"); - - if(pool_type != PoolingType::MAX) - { - build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING"); - } - - if((pool_size_x == 3) && (pool_size_y == 3) && !is_data_type_quantized_asymmetric(data_type)) - { - // Check if we have pool3x3 with stride_x less equal than 3. In these cases, run an optimized OpenCL kernel where - // each thread computes 4 dst elements - const bool is_pool3x3_stride_le3 = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3); - - std::string kernel_name = ((is_pool3x3_stride_le3) ? "pooling_layer_optimized_" : "pooling_layer_") - + support::cpp11::to_string(pool_size_x); - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } - else if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type)) - { - // For max pooling with pool2x2, store indicies which will be used in max unpooling - if(data_type == DataType::F32) - { - std::string kernel_name = "pooling_layer_2_nchw_indices_fp32"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } - else if(data_type == DataType::F16) - { - std::string kernel_name = "pooling_layer_2_nchw_indices_fp16"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } - } - else // Run general case - { - std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? 
"pooling_layer_MxN_quantized_nchw" : "pooling_layer_MxN_nchw"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } - break; - } - case DataLayout::NHWC: - { - // Floating point mixed precision is support on F16 only - const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX; - - // Wider accumulation is required to avoid accuracy loss - // Case 1: Floating point mixed precision (fp16 src data and fp32 accumulation) - // Cast 2: Quantized (int8/uint8 src data and int32 accumulation ) - DataType acc_data_type = data_type; - - if(use_fp_mixed_precision) - { - acc_data_type = DataType::F32; - } - else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX) - { - acc_data_type = DataType::S32; - } - - build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(acc_data_type)); - build_opts.add_option_if(use_fp_mixed_precision, "-DFP_MIXED_PRECISION"); - build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING"); - build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width))); - build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height))); - build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height))); - build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel))); - build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size))); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)); - if(pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type)) - { - build_opts.add_option_if(indices != nullptr && pool_type == PoolingType::MAX, "-DEXTRACT_MAX_INDEX"); - - std::string kernel_name = "pooling_layer_2x2_nhwc"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } - else - { - std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? 
"pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } - break; - } - default: - ARM_COMPUTE_ERROR("Not implemented"); - } - - // Set config_id for enabling LWS tuning - _config_id = "pooling_layer_"; - _config_id += lower_string(string_from_data_type(data_type)); - _config_id += "_"; - _config_id += lower_string(string_from_data_layout(_data_layout)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(idx_width)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(idx_height)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(idx_channel)); - _config_id += "_"; - _config_id += lower_string(string_from_data_layout(src->data_layout())); - - ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info)); -} - -Status ClPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices)); - ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(src->clone().get(), dst->clone().get(), pool_info))); - - return Status{}; -} - -void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - unsigned int pool_stride_x = 0; - unsigned int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride(); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST_0)); - auto indices = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST_1)); - - // Collapse window - Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - - switch(_data_layout) - { - case DataLayout::NCHW: - { - Window slice = window_collapsed.first_slice_window_3D(); - do - { - // Upsample src by pool size - Window in_slice(slice); - in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - _pool_info.pad_stride_info.pad_left(), - (in_slice.x().end() - _pool_info.pad_stride_info.pad_left()) * pool_stride_x, - pool_stride_x * _num_elems_processed_per_iteration)); - in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - _pool_info.pad_stride_info.pad_top(), - (in_slice.y().end() - _pool_info.pad_stride_info.pad_top()) * pool_stride_y, - pool_stride_y)); - - // Set srcs - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, in_slice); - add_3D_tensor_argument(idx, dst, slice); - if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2))) - { - add_3D_tensor_argument(idx, indices, slice); - } - enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); - break; - } - case DataLayout::NHWC: - { - const size_t batch_size = dst->info()->tensor_shape().total_size_upper(3); - - Window slice = window_collapsed.first_slice_window_4D(); - Window in_slice = window_collapsed.first_slice_window_4D(); - in_slice.set(Window::DimX, Window::Dimension(0, src->info()->dimension(0), _num_elems_processed_per_iteration)); - in_slice.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), 
pool_stride_x)); - in_slice.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y)); - in_slice.set(3, Window::Dimension(0, batch_size, 1)); - do - { - // Set srcs - unsigned int idx = 0; - add_4D_tensor_argument(idx, src, in_slice); - add_4D_tensor_argument(idx, dst, slice); - if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2))) - { - add_4D_tensor_argument(idx, indices, slice); - } - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice)); - break; - } - default: - ARM_COMPUTE_ERROR("Not implemented"); - } -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClPool2dKernel.h b/src/core/gpu/cl/kernels/ClPool2dKernel.h deleted file mode 100644 index ab8c56a857..0000000000 --- a/src/core/gpu/cl/kernels/ClPool2dKernel.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_POOL2D_KERNEL_H -#define ARM_COMPUTE_CL_POOL2D_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the pooling layer kernel */ -class ClPool2dKernel : public IClKernel -{ -public: - ClPool2dKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPool2dKernel); - - /** Configure kernel for a given list of arguments - * - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] dst Destination tensor info. Data types supported: same as @p src. - * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. - * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. 
- */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to ClPool2dKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -public: - PoolingLayerInfo _pool_info{}; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - BorderSize _border_size{ 0 }; - unsigned int _num_elems_processed_per_iteration{ 1 }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_POOL2D_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClQuantizeKernel.cpp b/src/core/gpu/cl/kernels/ClQuantizeKernel.cpp deleted file mode 100644 index 7900489db7..0000000000 --- a/src/core/gpu/cl/kernels/ClQuantizeKernel.cpp +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClQuantizeKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" - -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - - // Output must always be initialized - ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - - return Status{}; -} -} // namespace - -ClQuantizeKernel::ClQuantizeKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - auto padding_info = get_padding_info({ src, dst }); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - - const int vec_size_x = 16 / src->element_size(); - const int input_width_x = src->tensor_shape().x(); - const bool multi_access_x = (input_width_x / vec_size_x > 0); - - const UniformQuantizationInfo qinfo = dst->quantization_info().uniform(); - const DataType output_data_type = dst->data_type(); - - float scale_to_apply = qinfo.scale; - int32_t offset_to_apply = qinfo.offset; - if(is_data_type_quantized_asymmetric(src->data_type())) - { - /* - * In case of requantization of a quantized input tensor to an output tensor with another quantization - * instead of of apply dequantization and then a quantization functions, we just compute new scale and - * offset to apply. 
- * - * Assuming: - * - q_i as input quantized value - * - q_o as output quantized value - * - z_i as input quantization offset value - * - z_o as output quantization offset value - * - s_i as input quantization scale value - * - s_o as output quantization scale value - * - z_n as new quantization offset value - * - s_n as new quantization scale value - * - * q_o = ( q_i - z_i ) * s_i / s_o + z_o - * - * We can rewrite the formula as: - * - * q_o = ( q_i * s_i / s_o ) - z_i * s_i / s_o + z_o - * - * q_o = q_i / s_n + z_n - * - * Where: - * - * s_n = s_o / s_i - * - * z_n = - z_i * s_i / s_o + z_o - * - */ - const UniformQuantizationInfo qinfo_in = src->quantization_info().uniform(); - scale_to_apply /= qinfo_in.scale; - // In order to minimize flooring we convert the offset to a float, - // then compute the new offset in the float domain, - // finally we convert it back as int32_t - offset_to_apply -= static_cast(static_cast(qinfo_in.offset) * qinfo_in.scale / qinfo.scale); - } - - // Create kernel - CLBuildOptions build_opts; - build_opts.add_option_if(is_data_type_float(src->data_type()), "-DIS_FLOAT"); - build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_to_apply)); - build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_to_apply)); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); - build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output_data_type)); - build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(input_width_x - vec_size_x, 0))); - std::pair min_max_quant_values = quantization::get_min_max_values_from_quantized_data_type(output_data_type); - build_opts.add_option("-DMIN_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.first)); - build_opts.add_option("-DMAX_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.second)); - - _kernel = create_kernel(compile_context, "quantization_layer", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - if(multi_access_x) - { - win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); - } - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClQuantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); - return Status{}; -} - -void ClQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3); - Window slice = window_collapsed.first_slice_window_3D(); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClQuantizeKernel.h 
b/src/core/gpu/cl/kernels/ClQuantizeKernel.h deleted file mode 100644 index 1991a2fba8..0000000000 --- a/src/core/gpu/cl/kernels/ClQuantizeKernel.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_QUANTIZE_KERNEL_H -#define ARM_COMPUTE_CL_QUANTIZE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the quantization layer kernel. - * - * @note The implementation supports only 3D input tensors. - */ -class ClQuantizeKernel : public IClKernel -{ -public: - ClQuantizeKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClQuantizeKernel); - /** Set the input, output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16. - * @param[out] dst Destination tensor info with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16. - * - * @note Output auto initialization is not supported by this kernel - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClQuantizeKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_QUANTIZE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClReshapeKernel.cpp b/src/core/gpu/cl/kernels/ClReshapeKernel.cpp deleted file mode 100644 index fcda061930..0000000000 --- a/src/core/gpu/cl/kernels/ClReshapeKernel.cpp +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClReshapeKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" - -#include - -/** [ClReshapeKernel Kernel] **/ -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - - if(dst->tensor_shape().total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() != dst->tensor_shape().total_size()); - } - - return Status{}; -} -} // namespace - -ClReshapeKernel::ClReshapeKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClReshapeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - - auto padding_info = get_padding_info({ src, dst }); - - // Create kernel - std::set build_opts = { "-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()) }; - _kernel = create_kernel(compile_context, "reshape_layer", build_opts); - - // Add static arguments - const cl_int2 src_shape = - { - { - static_cast(src->tensor_shape()[0]), - static_cast(src->tensor_shape()[1]) - } - }; - const cl_int2 dst_shape = - { - { - static_cast(dst->tensor_shape()[0]), - static_cast(dst->tensor_shape()[1]) - } - }; - unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters - _kernel.setArg(idx++, src_shape); - _kernel.setArg(idx++, dst_shape); - - // Configure kernel window - Window win = calculate_max_window(*src); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClReshapeKernel::validate(const ITensorInfo *src, const 
ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); - - return Status{}; -} - -void ClReshapeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = window_collapsed.first_slice_window_3D(); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - // Set srcs - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, window_collapsed); - add_3D_tensor_argument(idx, dst, window_collapsed); - enqueue(queue, *this, slice, lws_hint()); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -/** [ClReshapeKernel Kernel] **/ diff --git a/src/core/gpu/cl/kernels/ClReshapeKernel.h b/src/core/gpu/cl/kernels/ClReshapeKernel.h deleted file mode 100644 index 01e1ee84b9..0000000000 --- a/src/core/gpu/cl/kernels/ClReshapeKernel.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_RESHAPE_KERNEL_H -#define ARM_COMPUTE_CL_RESHAPE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the kernel to perform tensor reshaping */ -class ClReshapeKernel : public IClKernel -{ -public: - ClReshapeKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClReshapeKernel); - /** Set the src and dst of the kernel - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data type supported: All. - * @param[out] dst Destination tensor info. 
Data type supported: Same as @p src - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClReshapeKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace opencl -} // namespace kernels -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_RESHAPE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClScaleKernel.cpp b/src/core/gpu/cl/kernels/ClScaleKernel.cpp deleted file mode 100644 index ee4ee22aa0..0000000000 --- a/src/core/gpu/cl/kernels/ClScaleKernel.cpp +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClScaleKernel.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/core/utils/ScaleUtils.h" -#include "support/Cast.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -inline std::pair calculate_scale_factors(const ITensorInfo *src, const ITensorInfo *dst, DataLayout data_layout, bool align_corners) -{ - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - // Compute the ratio between source width/height and destination width/height - const unsigned int src_width = src->dimension(idx_width); - const unsigned int src_height = src->dimension(idx_height); - const unsigned int dst_width = dst->dimension(idx_width); - const unsigned int dst_height = dst->dimension(idx_height); - - float scale_x = arm_compute::scale_utils::calculate_resize_ratio(src_width, dst_width, align_corners); - float scale_y = arm_compute::scale_utils::calculate_resize_ratio(src_height, dst_height, align_corners); - - return std::make_pair(scale_x, scale_y); -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::U8, DataType::S16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(dst == src); - ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy)); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && !is_data_type_quantized_asymmetric(src->data_type())); - - float scale_x = 0.f; - float scale_y = 0.f; - const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; - std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, data_layout, info.align_corners); - - ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA && (scale_x > 1.f || scale_y > 1.f)); - - return Status{}; -} -} // namespace - -Status ClScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, info)); - return Status{}; -} - -ClScaleKernel::ClScaleKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, info)); - auto padding_info = get_padding_info({ src, dst }); - - // Info required for the static tuning - _data_layout = info.data_layout == DataLayout::UNKNOWN ? 
src->data_layout() : info.data_layout; - - const bool is_nhwc = _data_layout == DataLayout::NHWC; - - float scale_x = 0.f; - float scale_y = 0.f; - std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, _data_layout, info.align_corners); - const bool is_qasymm_bilinear = is_data_type_quantized_asymmetric(src->data_type()) && info.interpolation_policy == InterpolationPolicy::BILINEAR; - - // Area interpolation behaves as Nearest Neighbour in case of up-sampling - auto interpolation_policy_to_use = info.interpolation_policy; - if(info.interpolation_policy == InterpolationPolicy::AREA && scale_x <= 1.f && scale_y <= 1.f) - { - interpolation_policy_to_use = InterpolationPolicy::NEAREST_NEIGHBOR; - } - - // Create kernel - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const unsigned int src_width = src->dimension(idx_width); - const unsigned int src_height = src->dimension(idx_height); - const unsigned int dst_width = dst->dimension(idx_width); - const unsigned int vec_size = adjust_vec_size(is_nhwc ? 1 : 4, dst_width); - const unsigned int vec_size_leftover = (dst_width % vec_size); - - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type())); - build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width)); - build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height)); - build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x)); - build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y)); - - build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE"); - build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT"); - build_opts.add_option_if(!is_nhwc, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size)); - build_opts.add_option_if(!is_nhwc, "-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0) ? 
support::cpp11::to_string(vec_size) : support::cpp11::to_string(vec_size_leftover))); - build_opts.add_option_if(is_nhwc, "-DDEPTH_OUT=" + support::cpp11::to_string(dst->dimension(2))); - build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT"); - build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS"); - if(is_qasymm_bilinear) - { - const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); - build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale)); - build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qinfo.offset)); - } - std::string interpolation_name = string_from_interpolation_policy(interpolation_policy_to_use); - std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower); - std::string kernel_name = "scale_" + interpolation_name + "_"; - kernel_name += lower_string(string_from_data_layout(_data_layout)); - - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*dst, Steps(vec_size)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); - - // Set config_id for enabling LWS tuning - _config_id = "scale_"; - _config_id += (info.border_mode == BorderMode::REPLICATE ? "Bord_rep" : ""); - _config_id += (info.sampling_policy == SamplingPolicy::CENTER ? "center" : "topleft"); - _config_id += (is_nhwc ? "nhwc" : "nchw"); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(3)); -} - -void ClScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - switch(_data_layout) - { - case DataLayout::NCHW: - { - Window slice = window.first_slice_window_2D(); - - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, src, slice); - add_2D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); - break; - } - case DataLayout::NHWC: - { - Window collapsed = window.collapse(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_4D(); - - unsigned int idx = 0; - add_4D_tensor_argument(idx, src, slice); - add_4D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - break; - } - default: - ARM_COMPUTE_ERROR("Data layout not supported"); - } -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClScaleKernel.h b/src/core/gpu/cl/kernels/ClScaleKernel.h deleted file mode 100644 index 6674931296..0000000000 --- a/src/core/gpu/cl/kernels/ClScaleKernel.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_SCALE_KERNEL_H -#define ARM_COMPUTE_CL_SCALE_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the scale kernel */ -class ClScaleKernel : public IClKernel -{ -public: - ClScaleKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClScaleKernel); - /** Initialise the kernel's inputs, output and interpolation policy - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32 - * @param[out] dst Destination tensor info. Data types supported: Same as @p src - * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. - * @param[in] info @ref ScaleKernelInfo Kernel descriptor to be used to configure. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClScaleKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - DataLayout _data_layout{ DataLayout::UNKNOWN }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_SCALE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClSoftmaxKernel.cpp b/src/core/gpu/cl/kernels/ClSoftmaxKernel.cpp deleted file mode 100644 index 1dd905d66e..0000000000 --- a/src/core/gpu/cl/kernels/ClSoftmaxKernel.cpp +++ /dev/null @@ -1,365 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClSoftmaxKernel.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/experimental/Types.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -/** Calculates softmax parameters from the quantized input scale and scaling factor for the exponent and places them as build options. - * - * Prepares these build options: - * -INPUT_BETA_MULTIPLIER, INPUT_BETA_LEFT_SHIFT - quantized representation of beta multiplier. - * -DIFF_MIN - threshold difference between maximum value of input data and current processed value, - * it defines whether the value will be taken into account or not. 
- * - * @param[in] build_opts Build options to extend - * @param[in] input_scale Input scaling factor - * @param[in] beta Exponent scaling factor beta - */ -CLBuildOptions prepare_quantized_softmax_build_options(float input_scale, float beta) -{ - // Number of integer bits in temporary fixed-point representation of current-to-max difference - static const int scaled_diff_int_bits = 5; - // Number of integer bits used in temporary fixed-point representation of exponent accumulator - static const int exp_accumulation_in_bits = 12; - - const double beta_multiplier = std::min( - 1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)), - (1LL << 31) - 1.0); - int input_beta_multiplier; - int input_beta_left_shift; - quantization::calculate_quantized_multiplier_greater_than_one(beta_multiplier, &input_beta_multiplier, &input_beta_left_shift); - - const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1LL << (31 - scaled_diff_int_bits)) / (1LL << input_beta_left_shift); - const int diff_min = -1.f * std::floor(max_input_rescaled); - - CLBuildOptions build_opts; - build_opts.add_option("-DSCALED_DIFF_INT_BITS=" + support::cpp11::to_string(scaled_diff_int_bits)); - build_opts.add_option("-DEXP_ACCUMULATION_INT_BITS=" + support::cpp11::to_string(exp_accumulation_in_bits)); - build_opts.add_option("-DINPUT_BETA_MULTIPLIER=" + support::cpp11::to_string(input_beta_multiplier)); - build_opts.add_option("-DINPUT_BETA_LEFT_SHIFT=" + support::cpp11::to_string(input_beta_left_shift)); - build_opts.add_option("-DDIFF_MIN=" + support::cpp11::to_string(diff_min)); - - return build_opts; -} - -Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum) -{ - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max); - - const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type()); - - // Checks performed when output is configured - if(dst.total_size() != 0) - { - if(is_quantized_asymmetric) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst); - } - - // Checks performed when sum is configured - if(sum.total_size() != 0) - { - if(is_quantized_asymmetric) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&sum, 1, DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&max, &sum); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&max, &sum); - } - - return Status{}; -} - -Status validate_arguments_1DNorm(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::S32, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &sum); - ARM_COMPUTE_RETURN_ERROR_ON(info.is_log && !is_data_type_float(info.input_data_type)); - - // Note: output should always have a scale of 1/256 and offset 0 - const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log); - const bool is_quantized_asymmetric = 
is_data_type_quantized_asymmetric(info.input_data_type); - - // Checks performed when output is configured - if(dst.total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst); - if(!is_quantized_asymmetric) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != allowed_quantization_info); - } - } - - return Status{}; -} -} // namespace - -/**< Grid size (obtained through auto-tuning) */ -const unsigned int ClLogits1DMaxShiftExpSumKernel::_grid_size = 64; -/**< Vector size in the serial case (obtained through auto-tuning) */ -const unsigned int ClLogits1DMaxShiftExpSumKernel::_serial_vector_size = 8; -/**< Vector size in the parallel case (obtained through auto-tuning, enables the best memory access pattern for Bifrost) .*/ -const unsigned int ClLogits1DMaxShiftExpSumKernel::_parallel_vector_size = 4; - -ClLogits1DMaxShiftExpSumKernel::ClLogits1DMaxShiftExpSumKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &max, ITensorInfo &dst, ITensorInfo &sum, const SoftmaxKernelInfo &info) -{ - auto padding_info = get_padding_info({ &src, &max, &dst, &sum }); - - // Output auto initialization if not yet initialized - auto_init_if_empty(sum, src.clone()->set_tensor_shape(max.tensor_shape())); - auto_init_if_empty(dst, *src.clone()); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DMaxShiftExpSum(src, max, dst, sum)); - - const DataType dt = src.data_type(); - const UniformQuantizationInfo qinfo = src.quantization_info().uniform(); - const size_t reduction_dim_size = src.dimension(0); - const float beta = info.beta; - const auto is_signed_qasymm8 = is_data_type_quantized_asymmetric_signed(info.input_data_type); - const int min_value = is_signed_qasymm8 ? CL_SCHAR_MIN : 0; - - ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(reduction_dim_size); - const unsigned int vector_size = adjust_vec_size(std::get<1>(parallel_reduction_info), reduction_dim_size); - - // Set build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)); - build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(min_value)); - build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size)); - build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(reduction_dim_size)); - build_opts.add_option("-DVECTOR_SIZE_LEFTOVER=" + support::cpp11::to_string(reduction_dim_size % vector_size)); - build_opts.add_option("-DLOG_VECTOR_SIZE=" + support::cpp11::to_string(lround(log2(vector_size)))); - build_opts.add_option_if((reduction_dim_size % vector_size) != 0, "-DNON_MULTIPLE_OF_VECTOR_SIZE"); - build_opts.add_option_if(is_signed_qasymm8, "-DQASYMM8_SIGNED"); - build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + float_to_string_with_full_precision(beta)); - build_opts.add_option_if(is_data_type_float(dt) && info.is_log, "-DLOG_SOFTMAX"); - build_opts.add_option_if(is_data_type_float(dt), "-DMINVAL=" + ((dt == DataType::F16) ? 
std::string("-HALF_MAX") : std::string("-FLT_MAX"))); - build_opts.add_options_if(is_data_type_quantized_asymmetric(dt), prepare_quantized_softmax_build_options(qinfo.scale, beta).options()); - - cl::NDRange lws_hint(cl::NullRange); - std::string kernel_name = std::string("softmax_layer_max_shift_exp_sum_") + (is_data_type_quantized_asymmetric(dt) ? "quantized_" : ""); - - // Configure parallel kernel if needed - if(std::get<0>(parallel_reduction_info)) - { - kernel_name += "parallel"; - bool is_grid_size_pow2 = (_grid_size != 0) && ((_grid_size & (_grid_size - 1)) == 0); - build_opts.add_option_if(is_grid_size_pow2 && _grid_size <= 256, "-DGRID_SIZE=" + support::cpp11::to_string(_grid_size)); - - // Handle boundary conditions. - const unsigned int multiple_grid_size = (reduction_dim_size / vector_size) % _grid_size; - build_opts.add_option_if((multiple_grid_size != 0) || ((reduction_dim_size % vector_size) != 0), "-DNON_MULTIPLE_OF_GRID_SIZE"); - // Setting _lws_hint in this way can also communicate grid_size to ClLogits1DMaxShiftExpSumKernel::run(). - // A single workgroup performs reduction in dimension 0 in the parallel case, hence lws[0]==gws[0]. - lws_hint = cl::NDRange(_grid_size); - } - else - { - kernel_name += "serial"; - } - - // Create kernel. - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure window - Window win = calculate_max_window(src, Steps(reduction_dim_size)); - IClKernel::configure_internal(win, lws_hint); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClLogits1DMaxShiftExpSumKernel::validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DMaxShiftExpSum(src, max, dst, sum)); - return Status{}; -} - -ClLogits1DMaxShiftExpSumKernel::ParallelReductionInfo ClLogits1DMaxShiftExpSumKernel::is_parallel_reduction(size_t size) -{ - bool is_parallel_reduction = (size >= (_grid_size * _serial_vector_size)) && (_grid_size > 1); - unsigned int vector_size = is_parallel_reduction ? 
_parallel_vector_size : _serial_vector_size; - return std::make_tuple(is_parallel_reduction, vector_size); -} - -void ClLogits1DMaxShiftExpSumKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - auto max = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_INT_0)); - auto sum = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_INT_1)); - - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, max, sum); - - // Collapse window in Z dimension - Window window_collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); - - // Reconfigure window in case of parallel reduction - ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(src->info()->dimension(0)); - if(std::get<0>(parallel_reduction_info)) - { - // Launch grid_size parallel work items - window_collapsed.set(Window::DimX, Window::Dimension(0, _grid_size, 1)); - } - - // Get slices - Window slice = window_collapsed.first_slice_window_3D(); - do - { - unsigned int idx = 0; - // Set inputs - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, max, slice); - add_3D_tensor_argument(idx, dst, slice); - add_3D_tensor_argument(idx, sum, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); -} - -ClLogits1DNormKernel::ClLogits1DNormKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClLogits1DNormKernel::configure(const CLCompileContext &compile_context, const ITensorInfo &src, const ITensorInfo &sum, ITensorInfo &dst, const SoftmaxKernelInfo &info) -{ - auto padding_info = get_padding_info({ &src, &dst, &sum }); - - // Note: output should always have a scale of 1/256 and offset 0 - const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type); - const DataType output_data_type = info.input_data_type; - const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log); - const UniformQuantizationInfo qinfo = src.quantization_info().uniform(); - - // Output auto initialization if not yet initialized - auto_init_if_empty(dst, src.clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info)); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DNorm(src, sum, dst, info)); - - const auto is_signed_qasymm8 = is_data_type_quantized_asymmetric_signed(info.input_data_type); - const int min_value = is_signed_qasymm8 ? 
CL_SCHAR_MIN : 0; - const unsigned int vector_size = adjust_vec_size(16, src.dimension(0)); - - // Set build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(info.input_data_type)); - build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(min_value)); - build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size)); - build_opts.add_option("-DVECTOR_SIZE_LEFTOVER=" + support::cpp11::to_string(src.dimension(0) % vector_size)); - build_opts.add_option_if(is_data_type_quantized_asymmetric_signed(info.input_data_type), "-DQASYMM8_SIGNED"); - build_opts.add_options_if(is_quantized_asymmetric, - prepare_quantized_softmax_build_options(qinfo.scale, info.beta).options()); - build_opts.add_option_if(info.is_log, "-DLOG_SOFTMAX"); - - // Create kernel - std::string kernel_name = std::string("softmax_layer_norm") + (is_quantized_asymmetric ? "_quantized" : ""); - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure window - auto win = calculate_max_window(src, Steps(vector_size)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClLogits1DNormKernel::validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DNorm(src, sum, dst, info)); - - return Status{}; -} - -void ClLogits1DNormKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - auto sum = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_INT_0)); - - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, sum); - - Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = window_collapsed.first_slice_window_3D(); - - do - { - Window sum_slice = slice; - sum_slice.set(Window::DimX, Window::Dimension(0, 1, 1)); - - unsigned int idx = 0; - // Set inputs - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, sum, sum_slice); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClSoftmaxKernel.h b/src/core/gpu/cl/kernels/ClSoftmaxKernel.h deleted file mode 100644 index a2ad02d6b7..0000000000 --- a/src/core/gpu/cl/kernels/ClSoftmaxKernel.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_SOFTMAX_KERNEL_H -#define ARM_COMPUTE_CL_SOFTMAX_KERNEL_H - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for max, shifting, exponentiating and summing the logits */ -class ClLogits1DMaxShiftExpSumKernel : public IClKernel -{ - /**< Grid size (obtained through auto-tuning) */ - static const unsigned int _grid_size; - /**< Vector size in the serial case (obtained through auto-tuning) */ - static const unsigned int _serial_vector_size; - /**< Vector size in the parallel case (obtained through auto-tuning, enables the best memory access pattern for Bifrost) .*/ - static const unsigned int _parallel_vector_size; - -public: - /** Info for whether a parallel reduction will be run and the vector size of the execution. */ - using ParallelReductionInfo = std::tuple; - - ClLogits1DMaxShiftExpSumKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClLogits1DMaxShiftExpSumKernel); - /** Configure the kernel using the given information about tensors - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - * @param[in,out] max Max values tensor. Data types supported: same as @p src - * @param[out] dst Destination tensor. Data types supported: same as @p src - * @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p src - * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo. - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &max, ITensorInfo &dst, ITensorInfo &sum, const SoftmaxKernelInfo &info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClLogits1DMaxShiftExpSumKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum); - /** Checks if the given size is eligible for parallel reduction - * - * @note Serial reduction is launched for width < (_grid_size * _serial_vector_size). 
- * @note Parallel reduction is launched for width >= (_grid_size * _serial_vector_size) and vector_size is forced to 4. - * - * @param[in] size Size to check - * - * @return A two-element tuple where the first element is a boolean specifying if a parallel reduction will be run, - * while the second element is the vector size of the execution. - */ - static ParallelReductionInfo is_parallel_reduction(size_t size); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; -}; - -/** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */ -class ClLogits1DNormKernel : public IClKernel -{ -public: - ClLogits1DNormKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClLogits1DNormKernel); - - /** Set the input and output tensors. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor. Data types supported: S32/F16/F32. If this kernel is used for log softmax, only F32/F16 is supported. - * @param[in] sum Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input - * @param[out] dst Destination tensor. Data types supported: QASYMM8/QASYMM8_SIGNED for S32 @p input, or same as @p input - * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo. - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo &src, const ITensorInfo &sum, ITensorInfo &dst, const SoftmaxKernelInfo &info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClLogits1DNormKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_SOFTMAX_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClTransposeKernel.cpp b/src/core/gpu/cl/kernels/ClTransposeKernel.cpp deleted file mode 100644 index 40bd4b034a..0000000000 --- a/src/core/gpu/cl/kernels/ClTransposeKernel.cpp +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClTransposeKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -ClTransposeKernel::ClTransposeKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClTransposeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Output auto initialization if not yet initialized - const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src); - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); - - ARM_COMPUTE_ERROR_THROW_ON(ClTransposeKernel::validate(src, dst)); - auto padding_info = get_padding_info({ src, dst }); - - // Create kernel - const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0)); - const int vec_size_x_leftovers = src->dimension(0) % vec_size_x; - const unsigned int vec_size_y = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(1)); - const int vec_size_y_leftovers = src->dimension(1) % vec_size_y; - - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE_IN_BYTES=" + support::cpp11::to_string(src->element_size())); - build_opts.add_option("-DVEC_SIZE_X=" + support::cpp11::to_string(vec_size_x)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER_X=" + support::cpp11::to_string(vec_size_x_leftovers)); - build_opts.add_option("-DVEC_SIZE_Y=" + support::cpp11::to_string(vec_size_y)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER_Y=" + support::cpp11::to_string(vec_size_y_leftovers)); - - _kernel = create_kernel(compile_context, "transpose", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*src, Steps(vec_size_x, vec_size_y)); - ICLKernel::configure_internal(win, cl::NDRange(2, 8)); - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 2, "Transpose up to 2-D src tensor is supported"); - - // Validate configured dst - if(dst->total_size() != 0) - { - const TensorInfo dst_info = src->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*src)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &dst_info); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - - return Status{}; -} - -void 
ClTransposeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window slice = window.first_slice_window_2D(); - - do - { - unsigned int idx = 0; - add_2D_tensor_argument(idx, src, slice); - add_2D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClTransposeKernel.h b/src/core/gpu/cl/kernels/ClTransposeKernel.h deleted file mode 100644 index c8379d44c7..0000000000 --- a/src/core/gpu/cl/kernels/ClTransposeKernel.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_TRANSPOSE_KERNEL_H -#define ARM_COMPUTE_CL_TRANSPOSE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to transpose a 2D tensor. */ -class ClTransposeKernel : public IClKernel -{ -public: - ClTransposeKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClTransposeKernel); - /** Set the src and dst of the kernel. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src The src tensor info. Data types supported: All. - * @param[in] dst The dst tensor info. 
Data types supported: Same as @p src - */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClTransposeKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_TRANSPOSE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClWeightsReshapeKernel.cpp b/src/core/gpu/cl/kernels/ClWeightsReshapeKernel.cpp deleted file mode 100644 index e3629f7706..0000000000 --- a/src/core/gpu/cl/kernels/ClWeightsReshapeKernel.cpp +++ /dev/null @@ -1,164 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
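As a brief illustration of the ClTransposeKernel contract above (2-D inputs only, destination auto-initialised to the transposed shape), a validation sketch with made-up shapes might look like the following; the function and shapes are illustrative, not part of this patch.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/gpu/cl/kernels/ClTransposeKernel.h"

using namespace arm_compute;

void transpose_validate_example()
{
    // Illustrative 2-D shapes: a 64x24 F32 tensor transposes to 24x64.
    const TensorInfo  src(TensorShape(64U, 24U), 1, DataType::F32);
    const TensorShape transposed = misc::shape_calculator::compute_transposed_shape(src); // (24, 64)
    const TensorInfo  dst(transposed, 1, DataType::F32);

    // validate() rejects inputs with more than two dimensions or mismatching dst metadata.
    ARM_COMPUTE_ERROR_THROW_ON(opencl::kernels::ClTransposeKernel::validate(&src, &dst));
}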
- */ -#include "src/core/gpu/cl/kernels/ClWeightsReshapeKernel.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -using namespace misc::shape_calculator; -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON(num_groups == 0); - ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::NHWC && num_groups > 1); - ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4 && num_groups > 1); - ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(3) % num_groups) != 0); - - if(biases != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON(!is_data_type_float(input->data_type())); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); - ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->num_dimensions() != 1)); - ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->num_dimensions() != 2)); - ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->dimension(0) != input->tensor_shape()[3])); - ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4])); - } - - // Checks performed when output is configured - if(output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_weights_reshaped_shape(*input, biases != nullptr, num_groups)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); - } - - return Status{}; -} -} // namespace - -ClWeightsReshapeKernel::ClWeightsReshapeKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClWeightsReshapeKernel::configure(const ClCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst, unsigned int num_groups) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_weights_reshaped_shape(*src, (biases != nullptr), num_groups))); - - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, biases, dst, num_groups)); - auto padding_info = get_padding_info({ src, biases, dst }); - - const DataType data_type = src->data_type(); - - // Create build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(data_type))); - build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups)); - build_opts.add_option_if(biases != nullptr, "-DHAS_BIAS"); - - // Create kernel - _kernel = create_kernel(compile_context, "reshape_to_columns", build_opts.options()); - - // Configure window - Window win = calculate_max_window(*src, Steps()); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClWeightsReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *biases, const 
ITensorInfo *dst, unsigned int num_groups) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, biases, dst, num_groups)); - return Status{}; -} - -void ClWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - - auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto biases = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_BIAS)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window out_window; - out_window.use_tensor_dimensions(dst->info()->tensor_shape()); - - Window in_slice = window.first_slice_window_3D(); - Window out_slice = out_window.first_slice_window_2D(); - - Window biases_window; - Window biases_slice; - - unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor(); - idx += (biases != nullptr) ? num_arguments_per_1D_tensor() : 0; - _kernel.setArg(idx++, src->info()->dimension(0)); - _kernel.setArg(idx++, src->info()->dimension(1)); - _kernel.setArg(idx++, src->info()->dimension(2)); - _kernel.setArg(idx++, src->info()->dimension(3)); - _kernel.setArg(idx++, dst->info()->strides_in_bytes().z()); - - if(biases != nullptr) - { - biases_window.use_tensor_dimensions(biases->info()->tensor_shape()); - biases_slice = biases_window.first_slice_window_1D(); - } - - do - { - // Set arguments - unsigned idx = 0; - add_3D_tensor_argument(idx, src, in_slice); - add_2D_tensor_argument(idx, dst, out_slice); - if(biases != nullptr) - { - add_1D_tensor_argument(idx, biases, biases_slice); - ARM_COMPUTE_UNUSED(biases_window.slide_window_slice_1D(biases_slice)); - } - - // Run kernel - enqueue(queue, *this, in_slice, lws_hint()); - } - while(window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClWeightsReshapeKernel.h b/src/core/gpu/cl/kernels/ClWeightsReshapeKernel.h deleted file mode 100644 index de2f2d10cc..0000000000 --- a/src/core/gpu/cl/kernels/ClWeightsReshapeKernel.h +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H -#define ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to perform reshaping on the weights used by convolution and locally connected layer - * - * Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels. - * In combination with the @ref opencl::kernels::ClIm2ColKernel can transform a convolution to a matrix multiplication. - * - * For example assuming a 3D weight kernel of 3x3 dimensions and depth of 2 we have: - * @f[ - * \left( \begin{array}{ccc} - * a000 & a001 & a002 \\ - * a010 & a011 & a012 \\ - * a020 & a021 & a022 \\ - * \end{array} \right) - * \left( \begin{array}{ccc} - * a100 & a101 & a102 \\ - * a110 & a111 & a112 \\ - * a120 & a121 & a122 \\ - * \end{array} \right) - * \rightarrow - * \left( \begin{array}{ccccccccc} - * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\ - * \end{array} \right) - * @f] - */ -class ClWeightsReshapeKernel : public IClKernel -{ -public: - ClWeightsReshapeKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWeightsReshapeKernel); - /** Set the input and output of the kernel. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src The input tensor info to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, - * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: All - * @param[in] biases The shared biases tensor info to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with - * dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32, for quantized types this must be nullptr. - * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. - * @param[out] dst The output tensor info. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise. - * Data types supported: Same as @p input - * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout - * Number of groups greater than one are only supported for NCHW data layout, and the number of weights must be a multiple of it. 
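The row linearisation described above, where each [kernel_x, kernel_y, IFM] volume of one output feature map is flattened into a single row, can be approximated with a plain host-side loop. This is only an illustration of the layout for num_groups == 1 and no bias, not the OpenCL implementation.

#include <cstddef>
#include <vector>

// Standalone approximation of the reshape performed by the kernel (num_groups == 1, no bias):
// weights are [kernel_x, kernel_y, IFM, OFM]; each OFM slice becomes one row of length kx*ky*IFM.
std::vector<std::vector<float>> reshape_weights(const std::vector<float> &w,
                                                std::size_t kx, std::size_t ky, std::size_t ifm, std::size_t ofm)
{
    std::vector<std::vector<float>> rows(ofm, std::vector<float>(kx * ky * ifm));
    for(std::size_t o = 0; o < ofm; ++o)
    {
        std::size_t col = 0;
        for(std::size_t c = 0; c < ifm; ++c)
        {
            for(std::size_t y = 0; y < ky; ++y)
            {
                for(std::size_t x = 0; x < kx; ++x)
                {
                    // Linear index of element (x, y, c, o) in a densely packed [kx, ky, IFM, OFM] buffer.
                    rows[o][col++] = w[((o * ifm + c) * ky + y) * kx + x];
                }
            }
        }
    }
    return rows;
}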
- */ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst, unsigned int num_groups = 1); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to ClWeightsReshapeKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst, unsigned int num_groups = 1); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /*ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H */ \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp b/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp deleted file mode 100644 index 8607620e92..0000000000 --- a/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/core/utils/helpers/tensor_info.h" -#include "support/Cast.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1); - ARM_COMPUTE_RETURN_ERROR_ON(src1->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) > dst->dimension(0)); - - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i)); - ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i)); - } - ARM_COMPUTE_RETURN_ERROR_ON(src1->num_dimensions() > 4); - - return Status{}; -} -} // namespace - -Status ClWidthConcatenate2TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst)); - return Status{}; -} - -ClWidthConcatenate2TensorsKernel::ClWidthConcatenate2TensorsKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst)); - - auto padding_info = get_padding_info({ src1, src2, dst }); - - const unsigned int min_dimension = std::min(src1->dimension(0), src2->dimension(0)); - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension); - const unsigned int vec_size_leftover = dst->dimension(0) % num_elems_processed_per_iteration; - - // Add build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1->data_type())); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); - build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(src1->dimension(2))); - build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(src1->dimension(0))); - build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(src2->dimension(0))); - build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size())); - build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); - - // If input have different quantization info set quantization parameters needed for the re-quantization process - const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2); - if(is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo) - { - const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform(); 
- const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); - - build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq1_info.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale)); - build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(iq2_info.offset)); - build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale)); - build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); - } - - // Create kernel - _kernel = create_kernel(compile_context, "concatenate_width_x2", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win.collapse(win, Window::DimZ)); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); - - // Set config_id for enabling LWS tuning - _config_id = "concatenate_width_x2_"; - _config_id += lower_string(string_from_data_type(src1->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(src1->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src1->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src2->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src2->dimension(1)); -} - -void ClWidthConcatenate2TensorsKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window slice = window.first_slice_window_4D(); - - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, src0, slice); - add_4D_tensor_argument(idx, src1, slice); - add_4D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, window, lws_hint()); - } - while(window.slide_window_slice_4D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h b/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h deleted file mode 100644 index 15e0757aec..0000000000 --- a/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_WIDTHCONCATENATE_2TENSORS_KERNEL_H -#define ARM_COMPUTE_CL_WIDTHCONCATENATE_2TENSORS_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the width concatenate kernel of 2 tensors. - * The src1 and src2 tensors will be concatenated into the dst tensor. - */ -class ClWidthConcatenate2TensorsKernel : public IClKernel -{ -public: - ClWidthConcatenate2TensorsKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWidthConcatenate2TensorsKernel); - /** Initialise the kernel's sources and destination - * - * @param[in] compile_context The compile context to be used. - * @param[in] src1 First source tensor info. Data types supported: All. - * @param[in] src2 Second source tensor info. Data types supported: same as @p src1 - * @param[out] dst Destination tensor info. Data types supported: Same as @p src1. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClWidthConcatenate2TensorsKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_WIDTH_CONCATENATE_2TENSORS_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp b/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp deleted file mode 100644 index edbc23c1d3..0000000000 --- a/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/core/utils/helpers/tensor_info.h" -#include "support/Cast.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1); - ARM_COMPUTE_RETURN_ERROR_ON(src1->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, src3, src4, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) + src3->dimension(0) + src4->dimension(0) > dst->dimension(0)); - - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i)); - ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i)); - ARM_COMPUTE_RETURN_ERROR_ON(src3->dimension(i) != dst->dimension(i)); - ARM_COMPUTE_RETURN_ERROR_ON(src4->dimension(i) != dst->dimension(i)); - } - ARM_COMPUTE_RETURN_ERROR_ON(src1->num_dimensions() > 4); - - return Status{}; -} -} // namespace - -ClWidthConcatenate4TensorsKernel::ClWidthConcatenate4TensorsKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -Status ClWidthConcatenate4TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, src3, src4, dst)); - return Status{}; -} - -void ClWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile_context, - ITensorInfo *src1, ITensorInfo *src2, - ITensorInfo *src3, ITensorInfo *src4, - ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, src3, src4, dst)); - - auto padding_info = get_padding_info({ src1, src2, src3, src4, dst }); - const unsigned int min_dimension = std::min(std::min(src1->dimension(0), src2->dimension(0)), std::min(src3->dimension(0), src4->dimension(0))); - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension); - const unsigned int vec_size_leftover = dst->dimension(0) % num_elems_processed_per_iteration; - - // Add build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1->data_type())); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); - build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(src1->dimension(2))); - build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(src1->dimension(0))); - 
build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(src2->dimension(0))); - build_opts.add_option("-DINPUT3_WIDTH=" + support::cpp11::to_string(src3->dimension(0))); - build_opts.add_option("-DINPUT4_WIDTH=" + support::cpp11::to_string(src4->dimension(0))); - build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size())); - build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); - build_opts.add_option("-DINPUT2_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); - build_opts.add_option("-DINPUT3_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) + src3->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); - - // If soources have different quantization info set quantization parameters needed for the re-quantization process - const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2, src3, src4); - if(is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo) - { - const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform(); - const UniformQuantizationInfo iq3_info = src3->quantization_info().uniform(); - const UniformQuantizationInfo iq4_info = src4->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); - - build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq1_info.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale)); - build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(iq2_info.offset)); - build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale)); - build_opts.add_option("-DOFFSET_IN3=" + float_to_string_with_full_precision(iq3_info.offset)); - build_opts.add_option("-DSCALE_IN3=" + float_to_string_with_full_precision(iq3_info.scale)); - build_opts.add_option("-DOFFSET_IN4=" + float_to_string_with_full_precision(iq4_info.offset)); - build_opts.add_option("-DSCALE_IN4=" + float_to_string_with_full_precision(iq4_info.scale)); - build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); - } - - // Create kernel - _kernel = create_kernel(compile_context, "concatenate_width_x4", build_opts.options()); - - // Configure kernel window - Window win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win.collapse(win, Window::DimZ)); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); - - // Set config_id for enabling LWS tuning - _config_id = "concatenate_width_x4_"; - _config_id += lower_string(string_from_data_type(src1->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(src1->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src1->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src2->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src2->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src3->dimension(0)); - _config_id += "_"; - _config_id += 
support::cpp11::to_string(src3->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src4->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src4->dimension(1)); -} - -void ClWidthConcatenate4TensorsKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC)); - const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); - const auto src2 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 2)); - const auto src3 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 3)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - Window slice = window.first_slice_window_4D(); - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, src0, slice); - add_4D_tensor_argument(idx, src1, slice); - add_4D_tensor_argument(idx, src2, slice); - add_4D_tensor_argument(idx, src3, slice); - add_4D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, window, lws_hint()); - } - while(window.slide_window_slice_4D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h b/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h deleted file mode 100644 index 1e3f47f7fb..0000000000 --- a/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef ARM_COMPUTE_CL_WIDTH_CONCATENATE_4TENSORS_KERNEL_H -#define ARM_COMPUTE_CL_WIDTH_CONCATENATE_4TENSORS_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the width concatenate kernel of 4 tensors. - * All source tensors will be concatenated into the destination tensor. 
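For context, run_op() above expects the four sources at consecutive ACL_SRC_VEC slots of the tensor pack. A hypothetical helper that assembles such a pack could look as follows; the helper name and tensor names are illustrative, while ITensorPack and TensorType are the library types already used by the code above.

#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/experimental/Types.h"

using namespace arm_compute;

// Illustrative helper: binds four sources and one destination the way run_op() retrieves them.
ITensorPack make_width_concat4_pack(const ITensor *t0, const ITensor *t1, const ITensor *t2, const ITensor *t3, ITensor *out)
{
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_VEC, t0);
    pack.add_const_tensor(TensorType::ACL_SRC_VEC + 1, t1);
    pack.add_const_tensor(TensorType::ACL_SRC_VEC + 2, t2);
    pack.add_const_tensor(TensorType::ACL_SRC_VEC + 3, t3);
    pack.add_tensor(TensorType::ACL_DST, out);
    return pack;
}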
- */ -class ClWidthConcatenate4TensorsKernel : public IClKernel -{ -public: - ClWidthConcatenate4TensorsKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWidthConcatenate4TensorsKernel); - /** Initialise the kernel's sources and destination - * - * @param[in] compile_context The compile context to be used. - * @param[in] src1 First source tensor info. Data types supported: All. - * @param[in] src2 Second source tensor info. Data types supported: same as @p src1 - * @param[in] src3 Third source tensor info. Data types supported: same as @p src1 - * @param[in] src4 Fourth source tensor info. Data types supported: same as @p src1 - * @param[out] dst Destination tensor info. Data types supported: same as @p src1. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *src3, ITensorInfo *src4, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClWidthConcatenate4TensorsKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_WIDTH_CONCATENATE_4TENSORS_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp b/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp deleted file mode 100644 index 5510c746f8..0000000000 --- a/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0)); - - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); - } - ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() > 4); - - return Status{}; -} -} // namespace - -ClWidthConcatenateKernel::ClWidthConcatenateKernel() -{ - _type = CLKernelType::ELEMENTWISE; -} - -Status ClWidthConcatenateKernel::validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, width_offset, dst)); - return Status{}; -} - -void ClWidthConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, width_offset, dst)); - - auto padding_info = get_padding_info({ src, dst }); - - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, src->dimension(0)); - - // Add build options - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(width_offset)); - build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(src->dimension(2))); - - if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) - { - const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); - const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); - - build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iqinfo.offset)); - build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oqinfo.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iqinfo.scale)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oqinfo.scale)); - } - - // Create kernel - _kernel = create_kernel(compile_context, "concatenate_width", build_opts.options()); - // Configure kernel window - Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); - ICLKernel::configure_internal(win.collapse(win, Window::DimZ)); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -void ClWidthConcatenateKernel::run_op(ITensorPack &tensors, const 
Window &window, ::cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - unsigned int idx = 0; - add_4D_tensor_argument(idx, src, window); - add_4D_tensor_argument(idx, dst, window); - enqueue(queue, *this, window, lws_hint()); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h b/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h deleted file mode 100644 index 300c4beb30..0000000000 --- a/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_WIDTH_CONCATENATE_KERNEL_H -#define ARM_COMPUTE_CL_WIDTH_CONCATENATE_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the width concatenate kernel. - * The source tensor will be concatenated into the destination tensor. - */ -class ClWidthConcatenateKernel : public IClKernel -{ -public: - ClWidthConcatenateKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWidthConcatenateKernel); - /** Initialise the kernel's source and destination - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. Data types supported: All. - * @param[in] width_offset The offset on the X axis. - * @param[in,out] dst Destination tensor info. Data types supported: same as @p src. 
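The width_offset parameter is the X position at which this source is written into dst; validate() above requires src->dimension(0) + width_offset to fit within dst->dimension(0). A sketch of how a caller might configure one kernel per source with an accumulating offset, with all helper names being illustrative:

#include <memory>
#include <vector>

#include "src/core/gpu/cl/kernels/ClWidthConcatenateKernel.h"

using namespace arm_compute;

// Illustrative driver: one ClWidthConcatenateKernel per source, each writing at the running X offset.
std::vector<std::unique_ptr<opencl::kernels::ClWidthConcatenateKernel>>
configure_width_concat(const CLCompileContext &ctx, const std::vector<ITensorInfo *> &srcs, ITensorInfo *dst)
{
    std::vector<std::unique_ptr<opencl::kernels::ClWidthConcatenateKernel>> kernels;
    unsigned int width_offset = 0;
    for(ITensorInfo *src : srcs)
    {
        ARM_COMPUTE_ERROR_THROW_ON(opencl::kernels::ClWidthConcatenateKernel::validate(src, width_offset, dst));
        auto k = std::make_unique<opencl::kernels::ClWidthConcatenateKernel>();
        k->configure(ctx, src, width_offset, dst);
        kernels.push_back(std::move(k));
        width_offset += src->dimension(0); // the next source starts where this one ends
    }
    return kernels;
}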
- * - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClWidthConcatenateKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_WIDTH_CONCATENATE_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp b/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp deleted file mode 100644 index ae43fed12d..0000000000 --- a/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -using namespace arm_compute::misc::shape_calculator; - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - - const Size2D kernel_size = winograd_info.kernel_size; - const Size2D output_tile_size = winograd_info.output_tile_size; - - const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd filter transform not supported"); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width || input->dimension(idx_h) != kernel_size.height); - ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); - - // Checks performed when output is configured - if(output->total_size() != 0) - { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input, winograd_info)); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_UNUSED(output); - - const unsigned int num_elems_processed_per_iteration_x = input->data_layout() == DataLayout::NCHW ? input->dimension(0) : 1; - const unsigned int num_elems_processed_per_iteration_y = input->dimension(1); - const unsigned int num_elems_read_per_iteration_z = input->data_layout() == DataLayout::NCHW ? 
1 : input->dimension(2); - - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y, num_elems_read_per_iteration_z)); - Window win_collapsed = win.collapse(win, Window::DimZ); - return std::make_pair(Status{}, win_collapsed); -} -} // namespace - -ClWinogradFilterTransformKernel::ClWinogradFilterTransformKernel() -{ - _type = CLKernelType::WINOGRAD; -} - -void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*src, winograd_info))); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, winograd_info)); - auto padding_info = get_padding_info({ src, dst }); - - // Set build options - CLBuildOptions build_opts; - build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(src->dimension(2))); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL"); - build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_FILTER_TRANSFORM_VERTICAL"); - const Size2D kernel_size = winograd_info.kernel_size; - const Size2D output_tile_size = winograd_info.output_tile_size; - - // Create kernel - std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(src->data_layout())); - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Configure kernel window - auto win_config = validate_and_configure_window(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - IClKernel::configure_internal(win_config.second); - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClWinogradFilterTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, winograd_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first); - - return Status{}; -} - -void ClWinogradFilterTransformKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window); - - auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - // Setup output window - Window window_out; - window_out.use_tensor_dimensions(dst->info()->tensor_shape(), 0); - - unsigned int idx = 0; - add_4D_tensor_argument(idx, src, window); - add_3D_tensor_argument(idx, dst, window_out); - enqueue(queue, *this, window, lws_hint()); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.h b/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.h deleted file mode 100644 index 145954fbb1..0000000000 --- a/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_WINOGRAD_FILTER_TRANSFORM_KERNEL_H -#define ARM_COMPUTE_CL_WINOGRAD_FILTER_TRANSFORM_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the Winograd filter transform kernel. */ -class ClWinogradFilterTransformKernel : public IClKernel -{ -public: - ClWinogradFilterTransformKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWinogradFilterTransformKernel); - /** Set the input and output tensor. - * - * @note Winograd filter transform supports the following configurations for NCWH data layout - * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3), - * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3), - * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5) - * - * @note Winograd filter transform supports the following configurations for NHWC data layout - * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3), - * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5) - * - * Strides: only unit strides - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout) or [IFM, kernel_x, kernel_y, OFM] (NHWC data layout). Data types supported: F16/F32. - * @param[out] dst The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. 
Data types supported: Same as @p input - * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to ClWinogradFilterTransformKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_WINOGRAD_FILTER_TRANSFORM_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp b/src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp deleted file mode 100644 index 538d8ae602..0000000000 --- a/src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp +++ /dev/null @@ -1,278 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
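For reference, a minimal sketch of how the filter-transform kernel deleted above is exercised through its public interface. It is grounded only in the configure()/validate() signatures and the shape/layout notes visible in this hunk; the tensor shape, the F(4x4, 3x3) configuration, the WinogradInfo argument order and the pre-move include path are assumptions.

    // Illustrative only: validates a Winograd 3x3 filter transform for NHWC weights.
    // Shape, tile/kernel sizes and the WinogradInfo argument order are assumptions.
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.h" // pre-move path

    arm_compute::Status check_filter_transform()
    {
        using namespace arm_compute;
        // NHWC weights [IFM, kernel_x, kernel_y, OFM], F32
        TensorInfo weights(TensorShape(64U, 3U, 3U, 128U), 1, DataType::F32);
        weights.set_data_layout(DataLayout::NHWC);
        TensorInfo transformed{}; // left empty; configure() would auto-initialise it

        // Assumed field order: output tile, kernel size, input dims, conv info, layout
        const WinogradInfo winograd_info(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(56U, 56U),
                                         PadStrideInfo(1, 1, 1, 1), DataLayout::NHWC);

        return opencl::kernels::ClWinogradFilterTransformKernel::validate(&weights, &transformed, winograd_info);
    }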
- */ -#include "src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - - const PadStrideInfo conv_info = winograd_info.convolution_info; - const Size2D output_tile_size = winograd_info.output_tile_size; - const Size2D kernel_size = winograd_info.kernel_size; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd input transform not supported"); - - ARM_COMPUTE_UNUSED(conv_info); - ARM_COMPUTE_UNUSED(output_tile_size); - ARM_COMPUTE_UNUSED(kernel_size); - - // Validate configured output - if(output->total_size() != 0) - { - const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info) -{ - ARM_COMPUTE_UNUSED(output); - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - bool window_changed = false; - Window win = calculate_max_window(*input, Steps(1, 1)); - - if(input->data_layout() == DataLayout::NCHW) - { - const PadStrideInfo conv_info = winograd_info.convolution_info; - const Size2D output_tile_size = winograd_info.output_tile_size; - const Size2D kernel_size = winograd_info.kernel_size; - - unsigned int num_elems_read_per_iteration_x = output_tile_size.width + kernel_size.width - 1; - unsigned int num_elems_read_per_iteration_y = output_tile_size.height + kernel_size.height - 1; - - AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(), num_elems_read_per_iteration_x, num_elems_read_per_iteration_y); - window_changed = update_window_and_padding(win, input_access); - } - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} -} // namespace - -ClWinogradInputTransformKernel::ClWinogradInputTransformKernel() -{ - _type = CLKernelType::WINOGRAD; -} - -BorderSize ClWinogradInputTransformKernel::border_size() const -{ - return _border_size; -} - -void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, winograd_info)); - - auto padding_info = get_padding_info({ src, dst }); - - const PadStrideInfo conv_info = winograd_info.convolution_info; - const Size2D output_tile_size = winograd_info.output_tile_size; - const Size2D kernel_size = winograd_info.kernel_size; - - _data_layout = src->data_layout(); - - const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - - // Compute the number of output tiles along the x and y direction of size "output_tile_size" - const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(src->dimension(idx_w), src->dimension(idx_h)), - kernel_size, - output_tile_size, - conv_info); - - _num_tiles_x = num_tiles.width; - _num_tiles_y = num_tiles.height; - - const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info); - - // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(output_shape)); - - ARM_COMPUTE_ERROR_ON(_num_tiles_x * _num_tiles_y != static_cast(dst->dimension(1))); - const size_t total_batches = src->tensor_shape().total_size_upper(3); - - CLBuildOptions build_opts; - if(_data_layout == DataLayout::NHWC) - { - build_opts.add_option("-DNHWC"); - build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_w))); - build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_h))); - build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x)); - build_opts.add_option("-DNUM_TILES_Y=" + support::cpp11::to_string(_num_tiles_y)); - build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left())); - build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top())); - build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width)); - build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height)); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL"); - build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL"); - } - else - { - build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x)); - build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left())); - build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top())); - build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width)); - build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height)); - build_opts.add_option("-DDATA_TYPE=" + 
get_cl_type_from_data_type(src->data_type())); - build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL"); - build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL"); - build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(2))); - } - - // Create kernel - std::string kernel_name = "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string(); - - // Get the maximum dimension from the tile size - const unsigned int tile_max_dim = std::max(output_tile_size.width, output_tile_size.height); - - // Check optimized kernel if output_dims == 2x2 - if((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW)) - { - _step_z = (src->dimension(2) % 2) != 0 ? 1 : 2; - } - - // Append stepz and data layout - kernel_name += "_stepz"; - kernel_name += support::cpp11::to_string(_step_z); - kernel_name += "_" + lower_string(string_from_data_layout(_data_layout)); - - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Create window and update padding - auto win_config = validate_and_configure_window(src, dst, winograd_info); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - IClKernel::configure_internal(win_config.second, cl::NDRange(1, 1, 8)); - - _border_size = BorderSize(src->padding()); - - ARM_COMPUTE_ERROR_ON((src->data_layout() == DataLayout::NHWC) && has_padding_changed(padding_info)); - - _config_id = kernel_name; - _config_id += support::cpp11::to_string(src->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(conv_info.pad_left()); - _config_id += "_"; - _config_id += support::cpp11::to_string(conv_info.pad_top()); - _config_id += "_"; - _config_id += lower_string(string_from_data_layout(_data_layout)); -} - -Status ClWinogradInputTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, winograd_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), winograd_info).first); - return Status{}; -} - -void ClWinogradInputTransformKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const size_t idx_c = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - const size_t total_batches = window.shape().total_size_upper(3); - - // Collapse window - Window window_collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); - - if(_data_layout == DataLayout::NHWC) - { - Window slice = window_collapsed.first_slice_window_3D(); - slice.set(1, Window::Dimension(0, _num_tiles_x * _num_tiles_y, 1)); - slice.set(2, Window::Dimension(0, 
total_batches, 1)); - - unsigned int idx = 0; - add_4D_tensor_argument(idx, src, slice); - add_4D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - } - else - { - Window slice = window_collapsed.first_slice_window_3D(); - slice.set(idx_w, Window::Dimension(0, _num_tiles_x, 1)); - slice.set(idx_h, Window::Dimension(0, _num_tiles_y, 1)); - - ARM_COMPUTE_ERROR_ON(((slice[idx_c].end() - slice[idx_c].start()) % _step_z) != 0); - slice.set(idx_c, Window::Dimension(slice[idx_c].start(), slice[idx_c].end(), _step_z)); - - unsigned int idx = 2 * num_arguments_per_3D_tensor(); - _kernel.setArg(idx++, static_cast(src->info()->strides_in_bytes()[3])); - _kernel.setArg(idx++, static_cast(dst->info()->strides_in_bytes()[3])); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src, slice); - add_3D_tensor_argument(idx, dst, slice); - - enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); - } -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.h b/src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.h deleted file mode 100644 index 40fc2f387a..0000000000 --- a/src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_WINOGRAD_INPUT_TRANSFORM_KERNEL_H -#define ARM_COMPUTE_CL_WINOGRAD_INPUT_TRANSFORM_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** OpenCL kernel to perform Winograd input transform.*/ -class ClWinogradInputTransformKernel : public IClKernel -{ -public: - ClWinogradInputTransformKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWinogradInputTransformKernel); - /** Set the input and output of the kernel. 
- * - * @note Winograd input transform supports the following configurations for NCWH data layout - * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3), - * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3), - * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5) - * - * @note Winograd input transform supports the following configurations for NHWC data layout - * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3), - * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5) - * - * Strides: only unit strides - * - * @param[in] compile_context The compile context to be used. - * @param[in] src The input tensor info to transform. Data types supported: F16/F32 - * @param[in] dst The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input - * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo. - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to ClWinogradInputTransformKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; - -private: - using WinogradKey = std::pair, std::pair>; - - BorderSize _border_size{ 0 }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - int _num_tiles_x{ 0 }; - int _num_tiles_y{ 0 }; - unsigned int _step_z{ 1 }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_WINOGRAD_INPUT_TRANSFORM_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp b/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp deleted file mode 100644 index f6ade57e5d..0000000000 --- a/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp +++ /dev/null @@ -1,268 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
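The configure() shown above assembles the OpenCL program name from the output tile, the kernel size, the Z step and the data layout. The helper below mirrors that logic as a standalone sketch; the function name and the example values are illustrative and not part of the patch.

    // Mirrors the kernel-name assembly in the removed configure(); sketch only.
    #include <algorithm>
    #include <string>
    #include "arm_compute/core/Size2D.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/Utils.h"

    std::string winograd_input_transform_kernel_name(const arm_compute::Size2D &output_tile,
                                                     const arm_compute::Size2D &kernel,
                                                     arm_compute::DataLayout layout,
                                                     size_t src_depth)
    {
        using namespace arm_compute;
        // Step along Z defaults to 1; the optimized 2x2 NCHW path uses 2 when the depth is even
        unsigned int step_z = 1;
        if(std::max(output_tile.width, output_tile.height) == 2 && layout == DataLayout::NCHW)
        {
            step_z = (src_depth % 2 != 0) ? 1 : 2;
        }
        return "winograd_input_transform_" + output_tile.to_string() + "_" + kernel.to_string()
               + "_stepz" + std::to_string(step_z) + "_" + lower_string(string_from_data_layout(layout));
    }
    // e.g. Size2D(2U, 2U), Size2D(3U, 3U), DataLayout::NCHW, 64U -> "winograd_input_transform_2x2_3x3_stepz2_nchw"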
- */ -#include "src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -#include - -using namespace arm_compute::misc::shape_calculator; - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_UNUSED(act_info); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - - ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != winograd_info.output_data_layout); - - const PadStrideInfo conv_info = winograd_info.convolution_info; - const Size2D output_tile_size = winograd_info.output_tile_size; - const Size2D kernel_size = winograd_info.kernel_size; - const Size2D input_dimensions = winograd_info.input_dimensions; - const unsigned int num_channels = (winograd_info.kernel_size.width + winograd_info.output_tile_size.width - 1) * (winograd_info.kernel_size.height + winograd_info.output_tile_size.height - 1); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, winograd_info.output_data_layout), "Winograd output transform not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != num_channels, "Wrong number of channels"); - - // Compute number of elements to process in the X and Y direction - // Compute the number of output tiles along the x and y direction of size "output_tile_size" - const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions, - kernel_size, - output_tile_size, - conv_info); - - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != static_cast((num_tiles.area()))); - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); - } - - // Checks performed when output is configured - if(output->total_size() != 0) - { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, winograd_info)); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, const Size2D &output_tile_size) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_UNUSED(bias); - - constexpr unsigned int num_elems_processed_per_iteration = 1; - - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - bool window_changed = false; - - if(output->data_layout() == DataLayout::NCHW) - { - const int 
output_static_window_end_x = ceil_to_multiple(output->dimension(0), output_tile_size.width); - const int output_static_window_end_y = ceil_to_multiple(output->dimension(1), output_tile_size.height); - - AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration); - AccessWindowStatic output_access(output, 0, 0, output_static_window_end_x, output_static_window_end_y); - window_changed = update_window_and_padding(win, input_access, output_access); - } - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} -} // namespace - -ClWinogradOutputTransformKernel::ClWinogradOutputTransformKernel() -{ - _type = CLKernelType::WINOGRAD; -} - -void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const WinogradInfo &winograd_info, - const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*src, winograd_info))); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, winograd_info, act_info)); - - // Configure kernel window - auto win_config = validate_and_configure_window(src, bias, dst, winograd_info.output_tile_size); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - IClKernel::configure_internal(win_config.second); - - auto padding_info = get_padding_info({ src, bias, dst }); - - _is_nhwc = winograd_info.output_data_layout == DataLayout::NHWC; - - // Compute num_tiles_x - const Size2D input_dimensions = winograd_info.input_dimensions; - const Size2D kernel_size = winograd_info.kernel_size; - const Size2D output_tile_size = winograd_info.output_tile_size; - const PadStrideInfo conv_info = winograd_info.convolution_info; - const int idx_width = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::HEIGHT); - - // Compute the number of output tiles along the x and y direction of size "output_tile_size" - const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions, - kernel_size, - output_tile_size, - conv_info); - const size_t total_batches = dst->tensor_shape().total_size_upper(3); - - // Set build options - CLBuildOptions build_opts; - build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); - build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); - build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); - - if((output_tile_size.x() == 2) || (output_tile_size.x() == 1 && output_tile_size.y() == 2)) - { - build_opts.add_option("-DVEC_SIZE=2"); - } - else if((output_tile_size.x() == 4) || (output_tile_size.x() == 1 && output_tile_size.y() == 4)) - { - build_opts.add_option("-DVEC_SIZE=4"); - } - - build_opts.add_option_if(bias != nullptr, std::string("-DHAS_BIAS")); - build_opts.add_option("-cl-fast-relaxed-math"); - build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step())); - build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(num_tiles.width)); - 
build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width)); - build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height)); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(1))); - build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(idx_width))); - build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height))); - build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(2))); - build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL"); - build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL"); - - // Create kernel - std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(winograd_info.output_data_layout)); - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(src->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(1)); - _config_id += "_"; - _config_id += lower_string(string_from_data_layout(winograd_info.output_data_layout)); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info) && _is_nhwc); -} - -Status ClWinogradOutputTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, (bias != nullptr ? bias->clone().get() : nullptr), dst, winograd_info, act_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), (bias != nullptr ? 
bias->clone().get() : nullptr), dst->clone().get(), winograd_info.output_tile_size).first); - return Status{}; -} - -void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window); - - auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - auto bias = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - // Collapse window - Window window_collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); - - // Get initial windows - Window slice = window_collapsed.first_slice_window_4D(); - slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - // Setup output slice - Window slice_out(slice); - slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - - if(bias != nullptr) - { - unsigned int idx1 = 2 * num_arguments_per_4D_tensor(); - Window slice_biases; - slice_biases.use_tensor_dimensions(bias->info()->tensor_shape()); - add_1D_tensor_argument(idx1, bias, slice_biases); - } - - if(_is_nhwc) - { - unsigned int idx2 = 2 * num_arguments_per_4D_tensor() + ((bias != nullptr) ? num_arguments_per_1D_tensor() : 0); - _kernel.setArg(idx2, static_cast(dst->info()->total_size() - dst->info()->strides_in_bytes().y())); - } - - do - { - unsigned int idx = 0; - add_4D_tensor_argument(idx, src, slice); - add_4D_tensor_argument(idx, dst, slice_out); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.h b/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.h deleted file mode 100644 index 22b7f079c1..0000000000 --- a/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CL_WINOGRAD_OUTPUT_TRANSFORM_KERNEL_H -#define ARM_COMPUTE_CL_WINOGRAD_OUTPUT_TRANSFORM_KERNEL_H - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the Winograd output transform kernel. */ -class ClWinogradOutputTransformKernel : public IClKernel -{ -public: - ClWinogradOutputTransformKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWinogradOutputTransformKernel); - /** Set the input and output tensor. - * - * @note Winograd output transform supports the following configurations for NCWH data layout - * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3), - * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3), - * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5) - * - * @note Winograd output transform supports the following configurations for NHWC data layout - * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3), - * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5) - * - * Strides: only unit strides - * - * @param[in] compile_context The compile context to be used. - * @param[in] src Source tensor info with shape [C, N, K, batches]. Data types supported: F16/F32. - * @param[in] bias Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p src - * @param[out] dst The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_output_transform_shape. Data types supported: Same as @p src - * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const WinogradInfo &winograd_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to ClWinogradOutputTransformKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - using WinogradKey = std::pair, std::pair>; - - bool _is_nhwc{ false }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_WINOGRAD_OUTPUT_TRANSFORM_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/gemm/ClGemmHelpers.cpp b/src/core/gpu/cl/kernels/gemm/ClGemmHelpers.cpp deleted file mode 100644 index 7866ccb679..0000000000 --- a/src/core/gpu/cl/kernels/gemm/ClGemmHelpers.cpp +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - -#include - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -std::pair configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, - bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image) -{ - ARM_COMPUTE_ERROR_ON(m0 == 0 || n0 == 0); - v0 = std::max(std::min(static_cast(m / m0), static_cast(v0)), static_cast(1)); - h0 = std::max(std::min(static_cast(n / n0), static_cast(h0)), static_cast(1)); - - const GEMMLHSMatrixInfo lhs_info(m0, k0, v0, lhs_transpose, lhs_interleave); - const GEMMRHSMatrixInfo rhs_info(n0, k0, h0, rhs_transpose, rhs_interleave, export_to_cl_image); - - return std::make_pair(lhs_info, rhs_info); -} - -std::pair select_lhs_rhs_info(std::pair info_img, - std::pair info_buf, - unsigned int n, unsigned int k, unsigned int b, DataType data_type) -{ - const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, data_type); - const TensorShape shape = misc::shape_calculator::compute_rhs_reshaped_shape(tensor_rhs_info, info_img.second); - const TensorInfo tensor_reshaped_info(shape, 1, data_type); - - if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, info_img.second))) - { - return info_img; - } - else - { - return info_buf; - } -} - -void update_padding_for_cl_image(ITensorInfo *tensor) -{ - constexpr unsigned int num_floats_per_pixel = 4; - - const unsigned int stride_y_in_elements = tensor->strides_in_bytes()[1] / tensor->element_size(); - const unsigned int pixel_alignment = get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()); - - ARM_COMPUTE_ERROR_ON_MSG(pixel_alignment == 0, "Cannot retrieve cl_image pitch alignment"); - if(pixel_alignment == 0) - { - return; - } - - const unsigned int row_pitch_alignment = pixel_alignment * num_floats_per_pixel; - const unsigned int round_up_width = ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment; - const unsigned int padding = round_up_width - stride_y_in_elements; - - tensor->extend_padding(PaddingSize(0, tensor->padding().right + 
padding, 0, 0)); -} - -Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info) -{ - if(rhs_info.export_to_cl_image) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.n0 == 2) || (rhs_info.n0 == 3), "Export to cl_image only supported with n0 = 4, 8 or 16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.k0 == 2) || (rhs_info.k0 == 3), "Export to cl_image only supported with k0 = 4, 8 or 16"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(&tensor_reshaped_info, DataType::F32, DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), "The extension cl_khr_image2d_from_buffer is not supported on the target platform"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, "Impossible to retrieve the cl_image pitch alignment"); - - // Check the width and height of the output tensor. - // Since we cannot create a 3d image from a buffer, the third dimension is collapsed on the second dimension - const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo(); - const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo(); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[0] > max_image_w * 4, "Not supported width for cl_image"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[1] * tensor_reshaped_info.tensor_shape()[2] > max_image_h, "Not supported height for cl_image"); - } - - return Status{}; -} -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h b/src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h deleted file mode 100644 index 3fce8c9173..0000000000 --- a/src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CL_GEMM_HELPERS_H -#define ARM_COMPUTE_CL_GEMM_HELPERS_H - -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -/** Configure @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo - * - * @param[in] m Number of rows (M) in the LHS matrix not reshaped - * @param[in] n Number of columns (N) in the RHS matrix not reshaped - * @param[in] m0 Number of rows processed by each thread/work-item - * @param[in] n0 Number of columns processed by each thread/work-item - * @param[in] k0 Number of inner accumulation performed by each thread/work-item - * @param[in] v0 Number of vertical blocks of size (m0xk0) stored on the same output row - * @param[in] h0 Number of horizontal blocks of size (k0xn0) stored on the same output row - * @param[in] lhs_interleave True if the v0 (m0xk0) blocks have to be interleaved in the output row - * @param[in] rhs_interleave True if the h0 (k0xn0) blocks have to be interleaved in the output row - * @param[in] lhs_transpose True if the (m0xk0) block has to be transposed before been stored - * @param[in] rhs_transpose True if the (k0xn0) block has to be transposed before been stored - * @param[in] export_to_cl_image (Optional) True if the RHS reshaped matrix has to be exported to cl_image - * - * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo - */ -std::pair configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, - bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image = false); - -/** Select @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo - * - * This function accepts two pairs of GEMMLHSMatrixInfo/GEMMRHSMatrixInfo where only the first is with cl_image2d support, - * and selects the valid one validating the GEMMRHSMatrixInfo. If the validation passes, the functions will return - * the first GEMMLHSMatrixInfo/GEMMRHSMatrixInfo pair with cl_image2d support. 
- * - * @param[in] info_img GEMMLHSMatrixInfo/GEMMRHSMatrixInfo with cl_image2d support - * @param[in] info_buf GEMMLHSMatrixInfo/GEMMRHSMatrixInfo to fall-back if cl_image2d cannot be used - * @param[in] n Number of columns (N) in the RHS matrix not reshaped - * @param[in] k Number of rows (K) in the RHS matrix not reshaped - * @param[in] b Batch size - * @param[in] data_type Data type - * - * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo - */ -std::pair select_lhs_rhs_info(std::pair info_img, - std::pair info_buf, - unsigned int n, unsigned int k, unsigned int b, DataType data_type); - -/** Update padding required to export the OpenCL buffer to OpenCL image2d - * - * @param[in,out] tensor ITensorInfo of the tensor required to be exported to OpenCL image2d - */ -void update_padding_for_cl_image(ITensorInfo *tensor); - -/** Utility function to validate the image2d OpenCL object support on the RHS reshaped matrix - * - * @param[in] tensor_reshaped_info TensorInfo for the RHS reshaped matrix - * @param[in] rhs_info @ref GEMMRHSMatrixInfo - * - * @return Status reporting if we can use the image2d OpenCL object on the RHS reshaped matrix - */ -Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info); -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_HELPERS_H */ diff --git a/src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h b/src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h deleted file mode 100644 index a49836cfda..0000000000 --- a/src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
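The GEMM helper hunks above lost the text inside angle brackets when the patch was flattened; the two central helpers return std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>, reconstructed here from the surrounding code. The block sizes in the sketch are illustrative rather than tuned values, and an initialised CL context is assumed because select_lhs_rhs_info() queries the device.

    // Sketch of the configure/select pattern from the removed ClGemmHelpers.h.
    #include <utility>
    #include "arm_compute/core/Types.h"
    #include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" // pre-move path

    using GemmConfigPair = std::pair<arm_compute::GEMMLHSMatrixInfo, arm_compute::GEMMRHSMatrixInfo>;

    GemmConfigPair pick_gemm_configs(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
    {
        using namespace arm_compute;
        using namespace arm_compute::opencl::kernels::gemm;

        // Candidate that exports the reshaped RHS to a cl_image (last argument true)...
        const GemmConfigPair info_img = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, false, true, false, true, true);
        // ...and a plain buffer fall-back with the same block sizes
        const GemmConfigPair info_buf = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, false, true, false, true, false);

        // Keeps the image variant only if validate_image2d_support_on_rhs() accepts it on this device
        return select_lhs_rhs_info(info_img, info_buf, n, k, b, DataType::F32);
    }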
- */ -#ifndef ARM_COMPUTE_ICL_GEMM_KERNEL_CONFIG_H -#define ARM_COMPUTE_ICL_GEMM_KERNEL_CONFIG_H - -#include "arm_compute/core/GPUTarget.h" -#include "arm_compute/core/Types.h" -#include "src/core/common/Macros.h" - -#include -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -/** Basic container for the OpenCL GEMM configuration functions */ -template -class CLGEMMConfigArray -{ -public: - /** Alias for F32 index */ - static constexpr size_t DT_F32 = 0; - /** Alias for F16 index */ - static constexpr size_t DT_F16 = 1; - /** Alias for Int8 index */ - static constexpr size_t DT_INT8 = 2; - - /** Constructor - * - * @param[in] func_f32 Function to call for GEMM F32 - * @param[in] func_f16 Function to call for GEMM F16 - * @param[in] func_int8 Function to call for GEMM Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) - * - */ - CLGEMMConfigArray(T func_f32, T func_f16, T func_int8) - : _configs{ func_f32, func_f16, func_int8 } - { - } - - /** Method to return the GEMM configuration function based on data type - * - * @param[in] data_type Input data type - * - * @return the valid function otherwise it returns nullptr if the data type is not valid - */ - T get_function(DataType data_type) - { - switch(data_type) - { - case DataType::F32: - return _configs.at(DT_F32); - case DataType::F16: - return _configs.at(DT_F16); - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - case DataType::QSYMM8_PER_CHANNEL: - return _configs.at(DT_INT8); - default: - return nullptr; - } - } - -private: - std::array _configs; -}; - -/** Basic interface for the GEMM kernel configuration */ -class IClGemmKernelConfig -{ -public: - /** Constructor - * - * @param[in] arch GPU target - */ - IClGemmKernelConfig(GPUTarget arch) - : _target(arch) - { - } - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClGemmKernelConfig); - /** Virtual destructor */ - virtual ~IClGemmKernelConfig() = default; - /** Given M, N, K and B, this method returns the @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo to be used - * - * @param[in] m Number of rows LHS matrix - * @param[in] n Number of columns RHS matrix - * @param[in] k Number of columns LHS matrix or number of rows RHS matrix - * @param[in] b Batch size - * @param[in] data_type Data type - */ - virtual std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0; - -protected: - GPUTarget _target; -}; -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_ICL_GEMM_KERNEL_CONFIG_H */ diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp deleted file mode 100644 index 9d11006703..0000000000 --- a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. 
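The IClGemmKernelConfig.h hunk above also lost its template arguments: CLGEMMConfigArray is a class template holding a std::array<T, 3> of per-data-type entries, and configure() returns std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>. The class below is a hypothetical stand-in that sketches the same dispatch-by-data-type pattern; the GEMMLHSMatrixInfo/GEMMRHSMatrixInfo constructor argument orders are taken from the removed ClGemmHelpers.cpp, and the concrete block sizes are not tuned values.

    // Hypothetical stand-in illustrating the member-function dispatch used by the
    // Bifrost/Midgard config classes; not part of the library.
    #include <array>
    #include <utility>
    #include "arm_compute/core/Types.h"

    class DemoGemmConfig
    {
    public:
        using GemmConfigPair = std::pair<arm_compute::GEMMLHSMatrixInfo, arm_compute::GEMMRHSMatrixInfo>;

        GemmConfigPair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, arm_compute::DataType data_type)
        {
            using Fn = GemmConfigPair (DemoGemmConfig::*)(unsigned int, unsigned int, unsigned int, unsigned int);
            // One entry each for F32, F16 and the quantized Int8 types, as in CLGEMMConfigArray<Fn>
            const std::array<Fn, 3> table{ &DemoGemmConfig::cfg_f32, &DemoGemmConfig::cfg_f32, &DemoGemmConfig::cfg_i8 };
            const size_t idx = (data_type == arm_compute::DataType::F32) ? 0 :
                               (data_type == arm_compute::DataType::F16) ? 1 : 2;
            return (this->*table[idx])(m, n, k, b);
        }

    private:
        GemmConfigPair cfg_f32(unsigned int, unsigned int, unsigned int, unsigned int)
        {
            // GEMMLHSMatrixInfo(m0, k0, v0, transpose, interleave); GEMMRHSMatrixInfo(n0, k0, h0, transpose, interleave, export_to_cl_image)
            return { arm_compute::GEMMLHSMatrixInfo(4, 4, 1, false, false), arm_compute::GEMMRHSMatrixInfo(4, 4, 1, false, false, false) };
        }
        GemmConfigPair cfg_i8(unsigned int, unsigned int, unsigned int, unsigned int)
        {
            return { arm_compute::GEMMLHSMatrixInfo(2, 16, 1, false, false), arm_compute::GEMMRHSMatrixInfo(2, 16, 1, false, false, false) };
        }
    };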
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/GPUTarget.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" - -#include - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -ClGemmDefaultConfigNativeBifrost::ClGemmDefaultConfigNativeBifrost(GPUTarget gpu) - : IClGemmKernelConfig(gpu) -{ -} - -std::pair ClGemmDefaultConfigNativeBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) -{ - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigNativeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); - - CLGEMMConfigArray configs_G71(&ClGemmDefaultConfigNativeBifrost::configure_G71_f32, - &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, // We use the F32 heuristic - &ClGemmDefaultConfigNativeBifrost::configure_G71_u8); - - CLGEMMConfigArray configs_G76(&ClGemmDefaultConfigNativeBifrost::configure_G76_f32, - &ClGemmDefaultConfigNativeBifrost::configure_G76_f32, // We use the F32 heuristic - &ClGemmDefaultConfigNativeBifrost::configure_G76_u8); - - CLGEMMConfigArray configs_G7x(&ClGemmDefaultConfigNativeBifrost::configure_default_f32, - &ClGemmDefaultConfigNativeBifrost::configure_default_f32, // We use the F32 heuristic - &ClGemmDefaultConfigNativeBifrost::configure_default_u8); - - ConfigurationFunctionExecutorPtr func = nullptr; - - switch(_target) - { - case GPUTarget::G76: - func = configs_G76.get_function(data_type); - break; - case GPUTarget::G71: - func = configs_G71.get_function(data_type); - break; - default: - func = configs_G7x.get_function(data_type); - break; - } - - ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); - return (this->*func)(m, n, k, b); -} - -std::pair ClGemmDefaultConfigNativeBifrost::configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - if(n < 2048) - { - return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); - } - else if(n >= 2048 && n < 8192) - { - return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 1, false, 
false, false, false); - } - } - else - { - return configure_lhs_rhs_info(m, n, 5, 4, 2, 1, 1, false, false, false, false); - } -} - -std::pair ClGemmDefaultConfigNativeBifrost::configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(dot8_supported(CLKernelLibrary::get().get_device())) - { - if(m == 1) - { - if(n < 2048) - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); - } - else if(n >= 2048 && n < 16384) - { - return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false); - } - } - else - { - if(m < 64) - { - return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false); - } - } - } - else - { - if(m == 1) - { - if(n < 8192) - { - return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false); - } - } - else - { - return configure_lhs_rhs_info(m, n, 2, 8, 16, 1, 1, false, false, false, false); - } - } -} - -std::pair ClGemmDefaultConfigNativeBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - if(n > 4196) - { - return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 1, false, false, false, false); - } - else - { - if(k < 2048) - { - return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 1, false, false, false, false); - } - else if(k >= 2048 && k < 16384) - { - return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 1, false, false, false, false); - } - } - } - else - { - return configure_lhs_rhs_info(m, n, 2, 8, 2, 1, 1, false, false, false, false); - } -} - -std::pair ClGemmDefaultConfigNativeBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - if(n < 2048) - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); - } - else if(n >= 2048 && n < 16384) - { - return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false); - } - } - else - { - if(m < 64) - { - return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false); - } - } -} - -std::pair ClGemmDefaultConfigNativeBifrost::configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 1, false, false, false, false); -} - -std::pair ClGemmDefaultConfigNativeBifrost::configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false); -} -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h 
b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h deleted file mode 100644 index 385b96e40e..0000000000 --- a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_BIFROST_H -#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_BIFROST_H - -#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -/** Bifrost based OpenCL GEMMNative configuration */ -class ClGemmDefaultConfigNativeBifrost final : public IClGemmKernelConfig -{ -public: - /** Constructor - * - * @param[in] gpu GPU target - */ - ClGemmDefaultConfigNativeBifrost(GPUTarget gpu); - - // Inherited overridden method - std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; - -private: - std::pair configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); -}; -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_BIFROST_H */ diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp deleted file mode 100644 index e3c129e3be..0000000000 --- a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. 
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/GPUTarget.h"
-#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-ClGemmDefaultConfigNativeMidgard::ClGemmDefaultConfigNativeMidgard(GPUTarget gpu)
-    : IClGemmKernelConfig(gpu)
-{
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
-{
-    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigNativeMidgard::*)(unsigned int m, unsigned int n, unsigned int k,
-                                                                                              unsigned int b);
-
-    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(nullptr,
-                                                                        nullptr,
-                                                                        &ClGemmDefaultConfigNativeMidgard::default_q8);
-
-    auto func = configs_default.get_function(data_type);
-    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM");
-    return (this->*func)(m, n, k, b);
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
-    ARM_COMPUTE_UNUSED(k);
-    ARM_COMPUTE_UNUSED(b);
-
-    const unsigned int m0 = std::min(m, static_cast<unsigned int>(4));
-    const unsigned int n0 = std::min(n, static_cast<unsigned int>(4));
-
-    return configure_lhs_rhs_info(m, n, m0, n0, 2, 1, 1, false, false, false, false);
-}
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h
deleted file mode 100644
index 0ff5471f7c..0000000000
--- a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_MIDGARD_H -#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_MIDGARD_H - -#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -/** Midgard based OpenCL GEMMNative configuration */ -class ClGemmDefaultConfigNativeMidgard final : public IClGemmKernelConfig -{ -public: - /** Constructor - * - * @param[in] gpu GPU target - */ - ClGemmDefaultConfigNativeMidgard(GPUTarget gpu); - - // Inherited overridden method - std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; - -private: - std::pair default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); -}; -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_MIDGARD_H */ diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp deleted file mode 100644 index 92767aca52..0000000000 --- a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/GPUTarget.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" - -#include - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -ClGemmDefaultConfigNativeValhall::ClGemmDefaultConfigNativeValhall(GPUTarget gpu) - : IClGemmKernelConfig(gpu) -{ -} - -std::pair ClGemmDefaultConfigNativeValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) -{ - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigNativeValhall::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); - - CLGEMMConfigArray configs_default(&ClGemmDefaultConfigNativeValhall::configure_G77_f32, - &ClGemmDefaultConfigNativeValhall::configure_G77_f16, - &ClGemmDefaultConfigNativeValhall::configure_G77_u8); - - auto func = configs_default.get_function(data_type); - ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); - return (this->*func)(m, n, k, b); -} - -std::pair ClGemmDefaultConfigNativeValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - if(n < 2048) - { - return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); - } - else if(n >= 2048 && n < 8192) - { - return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 1, false, false, false, false); - } - } - else - { - return configure_lhs_rhs_info(m, n, 5, 4, 2, 1, 1, false, false, false, false); - } -} - -std::pair ClGemmDefaultConfigNativeValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - if(n < 2048) - { - return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); - } - else if(n >= 2048 && n < 8192) - { - return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 1, false, false, false, false); - } - } - else - { - return configure_lhs_rhs_info(m, n, 4, 8, 2, 1, 1, false, false, false, false); - } -} - -std::pair ClGemmDefaultConfigNativeValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(dot8_supported(CLKernelLibrary::get().get_device())) - { - if(m == 1) - { - if(n < 2048) - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); - } - else if(n >= 2048 && n < 16384) - { - return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false); - } - } - else - { - if(m < 64) - { - return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false); - } - } - } - else - { - if(m == 
1) - { - if(n < 8192) - { - return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false); - } - } - else - { - return configure_lhs_rhs_info(m, n, 2, 8, 16, 1, 1, false, false, false, false); - } - } -} -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h deleted file mode 100644 index 17e4c9d339..0000000000 --- a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_VALHALL_H -#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_VALHALL_H - -#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -/** Valhall based OpenCL GEMMNative configuration */ -class ClGemmDefaultConfigNativeValhall final : public IClGemmKernelConfig -{ -public: - /** Constructor - * - * @param[in] gpu GPU target - */ - ClGemmDefaultConfigNativeValhall(GPUTarget gpu); - - // Inherited overridden method - std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; - -private: - std::pair configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); -}; -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_VALHALL_H */ diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h b/src/core/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h deleted file mode 100644 index ff6a0128af..0000000000 --- a/src/core/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. 
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_NATIVE_KERNEL_CONFIGURATION_H
-#define ARM_COMPUTE_CL_GEMM_NATIVE_KERNEL_CONFIGURATION_H
-
-#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
-#include "src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h"
-#include "src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h"
-#include "src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-/** CLGEMMNative factory class */
-class ClGemmNativeKernelConfigurationFactory final
-{
-public:
-    /** Static method to construct CLGEMMNative kernel object accordingly with the GPU target
-     *
-     * @param[in] gpu GPU target
-     *
-     * @return CLGEMMNative kernel configuration class
-     */
-    static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
-    {
-        switch(get_arch_from_target(gpu))
-        {
-            case GPUTarget::MIDGARD:
-                return std::make_unique<ClGemmDefaultConfigNativeMidgard>(gpu);
-            case GPUTarget::BIFROST:
-                return std::make_unique<ClGemmDefaultConfigNativeBifrost>(gpu);
-            case GPUTarget::VALHALL:
-                return std::make_unique<ClGemmDefaultConfigNativeValhall>(gpu);
-            default:
-                ARM_COMPUTE_ERROR("Not supported GPU target");
-        }
-    }
-};
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CL_GEMM_NATIVE_KERNEL_CONFIGURATION_H */
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp b/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp
deleted file mode 100644
index b030913a87..0000000000
--- a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp
+++ /dev/null
@@ -1,356 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/GPUTarget.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/TensorShape.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" - -#include - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -using namespace arm_compute::misc::shape_calculator; - -ClGemmDefaultConfigReshapedBifrost::ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu) - : IClGemmKernelConfig(gpu) -{ -} - -std::pair ClGemmDefaultConfigReshapedBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) -{ - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigReshapedBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - - CLGEMMConfigArray configs_G7x(&ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32, - &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16, - &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); - - CLGEMMConfigArray configs_G52(&ClGemmDefaultConfigReshapedBifrost::configure_G52_f32, - &ClGemmDefaultConfigReshapedBifrost::configure_G52_f16, - &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); - - CLGEMMConfigArray configs_G76(&ClGemmDefaultConfigReshapedBifrost::configure_G76_f32, - &ClGemmDefaultConfigReshapedBifrost::configure_G76_f16, - &ClGemmDefaultConfigReshapedBifrost::configure_G76_u8); - - ConfigurationFunctionExecutorPtr func = nullptr; - - switch(_target) - { - case GPUTarget::G76: - func = configs_G76.get_function(data_type); - break; - case GPUTarget::G52: - func = configs_G52.get_function(data_type); - break; - default: - func = configs_G7x.get_function(data_type); - break; - } - - ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); - return (this->*func)(m, n, k, b); -} - -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(n <= 4) - { - return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true); - } - else - { - return configure_lhs_rhs_info(m, n, 5, 4, 4, 2, 16, false, true, false, true); - } -} - -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(n <= 4) - { - return configure_lhs_rhs_info(m, n, 4, 2, 8, 8, 2, true, true, true, false); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 8, 4, 4, 2, true, true, true, false); - } -} - -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - 
if(dot8_supported(CLKernelLibrary::get().get_device())) - { - if(n <= 4) - { - return configure_lhs_rhs_info(m, n, 4, 2, 16, 2, 2, true, false, false, true); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, true, false, false, true); - } - } - else - { - if(n <= 4) - { - return configure_lhs_rhs_info(m, n, 4, 2, 8, 2, 2, true, false, false, true); - } - else - { - return configure_lhs_rhs_info(m, n, 6, 4, 4, 2, 2, true, true, false, true); - } - } -} - -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - const float r_mn = static_cast(m) / static_cast(n); - const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - const float r_mk = static_cast(m) / static_cast(k); - const float r_nk = static_cast(n) / static_cast(k); - - GEMMLHSMatrixInfo lhs_info_buf; - GEMMRHSMatrixInfo rhs_info_buf; - GEMMLHSMatrixInfo lhs_info_img; - GEMMRHSMatrixInfo rhs_info_img; - - if(workload <= 274.4000f) - { - if(r_nk <= 0.7461f) - { - if(r_mn <= 21.1667f) - { - return configure_lhs_rhs_info(m, n, 4, 2, 4, 4, 4, false, true, true, false, false); - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - } - else - { - if(r_mk <= 17.3926f) - { - if(workload <= 542.4000f) - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - } - else - { - if(r_nk <= 0.5463f) - { - if(workload <= 11767.6001f) - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); - - return 
select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - } - } -} - -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - - const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - - if(workload <= 323.4000f) - { - return configure_lhs_rhs_info(m, n, 2, 2, 8, 4, 8, false, false, false, true, false); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 8, 4, 2, 2, true, true, true, false, false); - } -} - -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - GEMMLHSMatrixInfo lhs_info_buf; - GEMMRHSMatrixInfo rhs_info_buf; - GEMMLHSMatrixInfo lhs_info_img; - GEMMRHSMatrixInfo rhs_info_img; - - // Get lhs_info/rhs_info in case of OpenCL buffer - if(n <= 4) - { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true); - } - else - { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 2, 8, 16, false, false, false, true); - } - - // Get lhs_info/rhs_info in case of OpenCL image - // Condition on the GPU workload - if((m / 4) * (n / 4) >= 2560) - { - // Big workload - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 8, true, true, true, false, true); - } - else - { - // Small workload - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 1, true, true, true, false, true); - } - - const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32); - const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, rhs_info_img); - const TensorInfo tensor_reshaped_info(shape, 1, DataType::F32); - - // In case of vector by matrix with few work-items, we use the OpenCL buffer rather than the OpenCL image2d - const bool use_cl_image2d = (n <= 4) ? 
false : true; - - if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) - { - return std::make_pair(lhs_info_img, rhs_info_img); - } - else - { - return std::make_pair(lhs_info_buf, rhs_info_buf); - } -} - -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - const float r_mk = static_cast(m) / static_cast(k); - - if(workload <= 1595.2000f) - { - if(r_mk <= 2.1044f) - { - if(workload <= 870.4000f) - { - return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 2, true, false, true, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 2, 4, 2, 2, false, false, true, false, false); - } - } - else - { - return configure_lhs_rhs_info(m, n, 4, 2, 4, 2, 2, false, false, true, false, false); - } - } - else - { - return configure_lhs_rhs_info(m, n, 4, 8, 4, 4, 2, true, true, true, false, false); - } -} - -std::pair ClGemmDefaultConfigReshapedBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(n <= 4) - { - return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, false, false, false, true); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, false, true, false, true); - } -} -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h b/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h deleted file mode 100644 index 52e6ce3f48..0000000000 --- a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_BIFROST_H -#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_BIFROST_H - -#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -/** Bifrost based OpenCL GEMMReshaped configuration */ -class ClGemmDefaultConfigReshapedBifrost final : public IClGemmKernelConfig -{ -public: - /** Constructor - * - * @param[in] gpu GPU target - */ - ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu); - - // Inherited overridden method - std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; - -private: - std::pair configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); -}; -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_BIFROST_H */ diff --git a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp b/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp deleted file mode 100644 index 57e42c92b3..0000000000 --- a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp +++ /dev/null @@ -1,538 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/GPUTarget.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" - -#include - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -ClGemmDefaultConfigReshapedValhall::ClGemmDefaultConfigReshapedValhall(GPUTarget gpu) - : IClGemmKernelConfig(gpu) -{ -} - -std::pair ClGemmDefaultConfigReshapedValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) -{ - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigReshapedValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - - CLGEMMConfigArray configs_G77(&ClGemmDefaultConfigReshapedValhall::configure_G77_f32, - &ClGemmDefaultConfigReshapedValhall::configure_G77_f16, - &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); - - CLGEMMConfigArray configs_G78(&ClGemmDefaultConfigReshapedValhall::configure_G78_f32, - &ClGemmDefaultConfigReshapedValhall::configure_G78_f16, - &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); - - ConfigurationFunctionExecutorPtr func = nullptr; - - switch(_target) - { - case GPUTarget::G78: - func = configs_G78.get_function(data_type); - break; - case GPUTarget::G77: - default: - func = configs_G77.get_function(data_type); - break; - } - - ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); - return (this->*func)(m, n, k, b); -} - -std::pair ClGemmDefaultConfigReshapedValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(n <= 4) - { - return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, 1, 0, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 5, 4, 4, 2, 16, 0, 1, 0, 1); - } -} - -std::pair ClGemmDefaultConfigReshapedValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - const float r_mn = static_cast(m) / static_cast(n); - const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - const float r_mk = static_cast(m) / static_cast(k); - const float r_nk = static_cast(n) / static_cast(k); - - GEMMLHSMatrixInfo lhs_info_buf; - GEMMRHSMatrixInfo rhs_info_buf; - GEMMLHSMatrixInfo lhs_info_img; - GEMMRHSMatrixInfo rhs_info_img; - - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 0); - - if(r_mk <= 0.11824845522642136) - { - if(workload <= 880.0) - { - return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0); - } - else - { - if(r_nk <= 0.42521367967128754) - { - if(workload <= 1726.4000244140625) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 0); - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - } - else - { - if(workload <= 1241.6000366210938) - { - return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 0); - } - } - } - } - else - { - if(workload <= 11404.7998046875) - { - if(r_mk <= 1.0126488208770752) - { - if(r_mn <= 2.545312523841858) - { - 
std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - else - { - return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0); - } - } - else - { - if(workload <= 2881.199951171875) - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, 0, 0, 1, 0, 1); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - } - } - else - { - if(r_nk <= 0.5765306055545807) - { - if(r_mn <= 6.010416746139526) - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - } - } -} - -std::pair ClGemmDefaultConfigReshapedValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - const float r_mn = static_cast(m) / static_cast(n); - const float r_mk = static_cast(m) / static_cast(k); - const float r_nk = static_cast(n) / static_cast(k); - const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - - if(workload <= 1288.0000f) - { - if(workload <= 505.6000f) - { - if(r_mn <= 0.4466f) - { - if(r_nk <= 0.2384f) - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 2, 2, 4, 2, 2, 0, 0, 1, 0, 0); - } - } - else - { - return configure_lhs_rhs_info(m, n, 2, 2, 4, 2, 2, 0, 0, 1, 0, 0); - } - } - else - { - if(r_mn <= 0.2250f) - { - if(r_mn <= 0.1599f) - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - } - else - { - if(r_mk <= 0.7609f) - { - if(r_mn <= 2.5453f) - { - if(workload <= 1089.6000f) - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 2, 4, 0, 0, 1, 0, 1); - } - } - else - { - return configure_lhs_rhs_info(m, n, 2, 4, 16, 4, 4, 0, 0, 1, 0, 1); - } - } - else - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); - } - } - } - } - else - { - if(workload <= 5434.4001f) - { - if(workload <= 1603.2000f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - else - { - if(r_nk <= 0.6192f) - { - if(r_mn <= 16.1016f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - else - { - if(workload <= 2750.0000f) - { - return 
configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - else - { - if(r_mk <= 6.3151f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - } - } - } - else - { - if(r_mk <= 0.0387f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); - } - else - { - if(r_mk <= 2.5859f) - { - if(r_mk <= 0.2734f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - } - } - } - } - else - { - if(r_mk <= 25.7500f) - { - if(r_mk <= 0.3615f) - { - if(r_mn <= 0.0913f) - { - if(r_mk <= 0.0683f) - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); - } - } - else - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - } - else - { - if(workload <= 11174.3999f) - { - if(r_mk <= 0.8047f) - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - else - { - if(workload <= 7185.5999f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1); - } - } - } - else - { - if(workload <= 17917.5000f) - { - if(r_mk <= 1.5078f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); - } - } - else - { - if(workload <= 34449.6016f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 4, 0, 0, 1, 0, 1); - } - } - } - } - } - else - { - if(r_mk <= 331.1111f) - { - if(workload <= 53397.5996f) - { - if(r_mn <= 57.8063f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); - } - } - else - { - if(r_nk <= 0.9211f) - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); - } - } - } - else - { - if(workload <= 38070.4004f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - } - } - } - } -} - -std::pair ClGemmDefaultConfigReshapedValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - const float r_mn = static_cast(m) / static_cast(n); - const float r_nk = static_cast(n) / static_cast(k); - const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - - if(workload <= 801.6000f) - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); - } - else - { - if(r_mn <= 0.1211f) - { - if(workload <= 3296.0000f) - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - else - { - if(r_nk <= 1.0625f) - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 4, 0, 0, 1, 0, 1); - } - } - } - else - { - if(workload <= 5068.8000f) - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); - } - else - { - if(r_nk <= 0.2361f) - { - if(workload <= 12630.0000f) - { - return 
configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 1, 0, 0, 1, 0, 1); - } - } - else - { - if(workload <= 178790.3984f) - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); - } - } - } - } - } -} - -std::pair ClGemmDefaultConfigReshapedValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(n <= 4) - { - return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, 0, 0, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, 0, 1, 0, 1); - } -} -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h b/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h deleted file mode 100644 index 588cd64e0e..0000000000 --- a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_VALHALL_H -#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_VALHALL_H - -#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -/** Valhall based OpenCL GEMMReshaped configuration */ -class ClGemmDefaultConfigReshapedValhall final : public IClGemmKernelConfig -{ -public: - /** Constructor - * - * @param[in] gpu GPU target - */ - ClGemmDefaultConfigReshapedValhall(GPUTarget gpu); - - // Inherited overridden method - std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; - -private: - std::pair configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); -}; -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_VALHALL_H */ diff --git a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h b/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h deleted file mode 100644 index c990c89a91..0000000000 --- a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */
-#ifndef ARM_COMPUTE_CL_GEMM_RESHAPED_KERNEL_CONFIGURATION_H
-#define ARM_COMPUTE_CL_GEMM_RESHAPED_KERNEL_CONFIGURATION_H
-
-#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
-#include "src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h"
-#include "src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-/** CLGEMMReshaped factory class */
-class ClGemmReshapedKernelConfigurationFactory final
-{
-public:
-    /** Static method to call the CLGEMMReshaped kernel configuration class accordingly with the GPU target
-     *
-     * @param[in] gpu GPU target
-     *
-     * @return CLGEMMReshaped kernel configuration class
-     */
-    static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
-    {
-        switch(get_arch_from_target(gpu))
-        {
-            case GPUTarget::MIDGARD:
-            case GPUTarget::BIFROST:
-                return std::make_unique<ClGemmDefaultConfigReshapedBifrost>(gpu);
-            case GPUTarget::VALHALL:
-                return std::make_unique<ClGemmDefaultConfigReshapedValhall>(gpu);
-            default:
-                ARM_COMPUTE_ERROR("Not supported GPU target");
-        }
-    }
-};
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_RESHAPED_KERNEL_CONFIGURATION_H */
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp
deleted file mode 100644
index 417d540468..0000000000
--- a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp
+++ /dev/null
@@ -1,547 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */ -#include "src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/GPUTarget.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/TensorShape.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" -#include - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -using namespace arm_compute::misc::shape_calculator; - -ClGemmDefaultConfigReshapedRhsOnlyBifrost::ClGemmDefaultConfigReshapedRhsOnlyBifrost(GPUTarget gpu) - : IClGemmKernelConfig(gpu) -{ -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) -{ - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigReshapedRhsOnlyBifrost::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); - - CLGEMMConfigArray configs_G51(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8); - - CLGEMMConfigArray configs_G52(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); - - CLGEMMConfigArray configs_G31(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8); - - CLGEMMConfigArray configs_G76(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8); - - CLGEMMConfigArray configs_G7x(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); - - ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) - { - case GPUTarget::G76: - func = configs_G76.get_function(data_type); - break; - case GPUTarget::G51: - func = configs_G51.get_function(data_type); - break; - case GPUTarget::G52: - func = configs_G52.get_function(data_type); - break; - case GPUTarget::G31: - func = configs_G31.get_function(data_type); - break; - default: - func = configs_G7x.get_function(data_type); - break; - } - - ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); - return (this->*func)(m, n, k, b); -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - if(n <= 2548) - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, false, true, false, true, false); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 8, false, true, false, true, false); - } - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 4, false, true, false, true); - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - const unsigned int h0 = std::max(n / 2, 1U); - 
return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1); - } - else - { - const int h0 = std::max(std::min(static_cast(n / 4), static_cast(256)), static_cast(1)); - if(m >= 28) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, 0, 1, 0, 1); - } - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - GEMMLHSMatrixInfo lhs_info_buf; - GEMMRHSMatrixInfo rhs_info_buf; - GEMMLHSMatrixInfo lhs_info_img; - GEMMRHSMatrixInfo rhs_info_img; - - const bool is_workload_big = ((m * n * b) / 16) >= 2048; - - if(m == 1) - { - if(n >= 8192) - { - const unsigned int h0 = std::max(n / 4, 1U); - return configure_lhs_rhs_info(m, n, 1, 4, 8, 1, h0, false, true, false, true, false); - } - else - { - const unsigned int h0 = std::max(n / 2, 1U); - if(n <= 204) - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true, false); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true, false); - } - } - } - else - { - const int h0 = std::max(std::min(static_cast(n / 4), static_cast(16)), static_cast(1)); - if(is_workload_big) - { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true); - } - else - { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true); - } - } - - // Get lhs_info/rhs_info in case of OpenCL image - const int h0 = std::max(std::min(static_cast(n / 4), static_cast(16)), static_cast(1)); - if(is_workload_big) - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true); - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true); - } - - const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32); - const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, rhs_info_img); - const TensorInfo tensor_reshaped_info(shape, 1, DataType::F32); - - // In case of vector by matrix or small workloads, we use the OpenCL buffer rather than the OpenCL image2d - const bool use_cl_image2d = ((m == 1) || ((((m * n * b) / 16) < 2048) && n < 128)) ? 
false : true; - - if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) - { - return std::make_pair(lhs_info_img, rhs_info_img); - } - else - { - return std::make_pair(lhs_info_buf, rhs_info_buf); - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - const float r_nk = static_cast(n) / static_cast(k); - - GEMMLHSMatrixInfo lhs_info_buf; - GEMMRHSMatrixInfo rhs_info_buf; - GEMMLHSMatrixInfo lhs_info_img; - GEMMRHSMatrixInfo rhs_info_img; - - if(m == 1) - { - if(r_nk <= 0.4664f) - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 16, false, true, false, true, false); - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, false); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - } - else - { - if(workload <= 274.4000f) - { - return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 16, false, false, false, true, false); - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, false); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - const unsigned int n0 = n < 1280 ? 
2 : 4; - const unsigned int h0 = std::max(n / n0, 1U); - return configure_lhs_rhs_info(m, n, 1, n0, 4, 1, h0, false, true, false, true); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true); - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - if(n > 2048) - { - const unsigned int h0 = std::max(n / 4, 1U); - return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true); - } - else - { - const unsigned int h0 = std::max(n / 2, 1U); - return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true); - } - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 4, false, true, false, true); - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - const float r_mn = static_cast(m) / static_cast(n); - const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - const float r_mk = static_cast(m) / static_cast(k); - const float r_nk = static_cast(n) / static_cast(k); - - GEMMLHSMatrixInfo lhs_info_buf; - GEMMRHSMatrixInfo rhs_info_buf; - GEMMLHSMatrixInfo lhs_info_img; - GEMMRHSMatrixInfo rhs_info_img; - - if(m == 1) - { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, false); - - if(r_mk <= 0.0026f) - { - if(r_nk <= 0.4664f) - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - } - else - { - if(r_mk <= 0.0148f) - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - } - } - else - { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 8, 4, 1, 2, false, false, false, false, false); - - if(workload <= 362.6000f) - { - return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false); - } - else - { - if(r_mn <= 22.6067f) - { - if(workload <= 708.8000f) - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - else - { - return configure_lhs_rhs_info(m, n, 5, 8, 2, 1, 16, false, false, false, false, false); - } - } - else - { - if(r_nk <= 0.0917f) - { - return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false); - } - else - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - } - } - } -} - -std::pair 
ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - - if(m == 1) - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); - } - else - { - const float r_mn = static_cast(m) / static_cast(n); - const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - - if(workload <= 7449.60f) - { - if(workload <= 691.60f) - { - return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 8, false, false, false, false, false); - } - else - { - if(workload <= 4155.20f) - { - return configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); - } - else - { - return configure_lhs_rhs_info(m, n, 5, 8, 2, 1, 32, false, false, false, false, false); - } - } - } - else - { - if(workload <= 16300.80f) - { - if(r_mn <= 44.56f) - { - GEMMLHSMatrixInfo lhs_info_buf; - GEMMRHSMatrixInfo rhs_info_buf; - GEMMLHSMatrixInfo lhs_info_img; - GEMMRHSMatrixInfo rhs_info_img; - - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, false, true, false, false, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - else - { - return configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); - } - } - else - { - GEMMLHSMatrixInfo lhs_info_buf; - GEMMRHSMatrixInfo rhs_info_buf; - GEMMLHSMatrixInfo lhs_info_img; - GEMMRHSMatrixInfo rhs_info_img; - - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, true, false, false, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); - } - } - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - const unsigned int n0 = n < 1280 ? 
2 : 4; - const unsigned int h0 = std::max(n / n0, 1U); - return configure_lhs_rhs_info(m, n, 1, n0, 8, 1, h0, false, true, false, true); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true); - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(dot8_supported(CLKernelLibrary::get().get_device())) - { - if(m == 1) - { - const unsigned int h0 = std::max(n / 2, 1U); - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true); - } - else - { - const unsigned int h0 = std::max(n / 4, 1U); - return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, false, true, false, true); - } - } - else - { - const int h0 = std::max(std::min(static_cast(n / 2), static_cast(128)), static_cast(1)); - if(m == 1) - { - return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, h0, false, true, false, true); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 2, 16, 1, h0, false, true, false, true); - } - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - const unsigned int h0 = std::max(n / 2, 1U); - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, 2, false, true, false, true); - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - const unsigned int h0 = std::max(n / 2, 1U); - return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, false, true, false, true); - } - else - { - const unsigned int h0 = std::max(n / 2, 1U); - return configure_lhs_rhs_info(m, n, 4, 2, 16, 1, h0, false, true, false, true); - } -} - -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h deleted file mode 100644 index 98c8e53569..0000000000 --- a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_BIFROST_H
-#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_BIFROST_H
-
-#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-/** Bifrost based OpenCL GEMMReshapedOnlyRHS configuration */
-class ClGemmDefaultConfigReshapedRhsOnlyBifrost final : public IClGemmKernelConfig
-{
-public:
-    /** Constructor
-     *
-     * @param[in] gpu GPU target
-     */
-    ClGemmDefaultConfigReshapedRhsOnlyBifrost(GPUTarget gpu);
-
-    // Inherited overridden method
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
-
-private:
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-};
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_BIFROST_H */
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp
deleted file mode 100644
index 4c6e633896..0000000000
--- a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp
+++ /dev/null
@@ -1,570 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/GPUTarget.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/TensorShape.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" - -#include - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace gemm -{ -using namespace arm_compute::misc::shape_calculator; - -ClGemmDefaultConfigReshapedRhsOnlyValhall::ClGemmDefaultConfigReshapedRhsOnlyValhall(GPUTarget gpu) - : IClGemmKernelConfig(gpu) -{ -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) -{ - using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigReshapedRhsOnlyValhall::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); - - CLGEMMConfigArray configs_G77(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); - - CLGEMMConfigArray configs_G78(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); - - ConfigurationFunctionExecutorPtr func = nullptr; - - switch(_target) - { - case GPUTarget::G78: - func = configs_G78.get_function(data_type); - break; - case GPUTarget::G77: - default: - func = configs_G77.get_function(data_type); - break; - } - - ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); - return (this->*func)(m, n, k, b); -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - if(m == 1) - { - const float r_mn = static_cast(m) / static_cast(n); - const float r_mk = static_cast(m) / static_cast(k); - - if(r_mk <= 0.0064484127797186375) - { - if(r_mn <= 0.0028273810748942196) - { - GEMMLHSMatrixInfo lhs_info_buf; - GEMMRHSMatrixInfo rhs_info_buf; - GEMMLHSMatrixInfo lhs_info_img; - GEMMRHSMatrixInfo rhs_info_img; - - const unsigned int h0 = std::max(n / 4, 1U); - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, 0, 1, 0, 0, 1); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, 0, 1, 0, 1, 0); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 8, 0, 1, 0, 0, 0); - } - } - else - { - if(r_mk <= 0.020312500186264515) - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, 0, 1, 0, 0, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, 0, 1, 0, 1, 
0); - } - } - } - else - { - const float r_mn = static_cast(m) / static_cast(n); - const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - const float r_mk = static_cast(m) / static_cast(k); - - if(workload <= 1999.2000122070312) - { - if(workload <= 747.1999816894531) - { - return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); - } - else - { - GEMMLHSMatrixInfo lhs_info_buf; - GEMMRHSMatrixInfo rhs_info_buf; - GEMMLHSMatrixInfo lhs_info_img; - GEMMRHSMatrixInfo rhs_info_img; - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 2, 0, 0, 0, 1, 1); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - } - else - { - if(r_mn <= 0.03348214365541935) - { - if(r_mk <= 0.028125000186264515) - { - return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); - } - else - { - GEMMLHSMatrixInfo lhs_info_buf; - GEMMRHSMatrixInfo rhs_info_buf; - GEMMLHSMatrixInfo lhs_info_img; - GEMMRHSMatrixInfo rhs_info_img; - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 2, 0, 0, 0, 1, 1); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - } - else - { - GEMMLHSMatrixInfo lhs_info_buf; - GEMMRHSMatrixInfo rhs_info_buf; - GEMMLHSMatrixInfo lhs_info_img; - GEMMRHSMatrixInfo rhs_info_img; - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, 0, 1, 0, 0, 1); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 1, 0, 1, 0); - - return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); - } - } - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - const unsigned int h0 = std::max(n / 2, 1U); - if(n <= 836.0) - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, 0, 1, 0, 1, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, 0, 1, 0, 1, 0); - } - } - else if(m < 128) - { - const int h0 = std::max(std::min(static_cast(n / 4), static_cast(256)), static_cast(1)); - if(k >= 512) - { - return configure_lhs_rhs_info(m, n, 2, 4, 16, 1, h0, 0, 1, 0, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, 0, 1, 0, 0); - } - } - else - { - const int h0 = std::max(std::min(static_cast(n / 4), static_cast(256)), static_cast(1)); - if(n >= 64) - { - return configure_lhs_rhs_info(m, n, 4, 8, 4, 1, h0, 0, 1, 0, 0); - } - else - { - if(k >= 512) - { - return configure_lhs_rhs_info(m, n, 2, 4, 16, 1, h0, 0, 1, 0, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, 0, 1, 0, 0); - } - } - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(m == 1) - { - const unsigned int h0 = std::max(n / 2, 1U); - return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1); - } - else - { - const int h0 = 
std::max(std::min(static_cast(n / 4), static_cast(256)), static_cast(1)); - if(m >= 28) - { - return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, 0, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 2, 4, 16, 1, h0, 0, 1, 0, 1); - } - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - const float r_mn = static_cast(m) / static_cast(n); - const float r_mk = static_cast(m) / static_cast(k); - const float r_nk = static_cast(n) / static_cast(k); - const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - - if(m == 1) - { - if(workload <= 278.7000f) - { - if(workload <= 7.5000f) - { - return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); - } - else - { - if(r_mn <= 0.0031f) - { - if(workload <= 256.6000f) - { - if(workload <= 16.7500f) - { - if(r_nk <= 1.6671f) - { - return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); - } - } - else - { - return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); - } - } - else - { - return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); - } - } - else - { - if(r_mk <= 0.0027f) - { - if(r_mk <= 0.0014f) - { - return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); - } - else - { - if(workload <= 8.9500f) - { - return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); - } - } - } - else - { - if(workload <= 14.1500f) - { - return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); - } - else - { - if(r_mk <= 0.0041f) - { - return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); - } - } - } - } - } - } - else - { - if(workload <= 363.7000f) - { - if(r_mk <= 0.0031f) - { - return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 32, 0, 1, 0, 1, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 32, 0, 1, 0, 1, 0); - } - } - else - { - return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 32, 0, 1, 0, 1, 0); - } - } - } - else - { - if(workload <= 1384.8000f) - { - if(workload <= 704.0000f) - { - return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 32, 0, 1, 0, 1, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 4, 0, 0, 0, 1, 1); - } - } - else - { - if(workload <= 16761.6006f) - { - if(r_mn <= 187.1250f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 0, 0, 1, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 4, 0, 0, 0, 1, 1); - } - } - else - { - if(r_mk <= 432.4630f) - { - return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 16, 0, 0, 0, 1, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 16, 0, 1, 0, 1, 1); - } - } - } - } -} - -std::pair ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - const float r_mn = static_cast(m) / static_cast(n); - const float r_mk = static_cast(m) / static_cast(k); - const float r_nk = static_cast(n) / static_cast(k); - const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; - - if(m == 1) - { - if(r_mn <= 0.0038f) - { - if(workload <= 353.9000f) - { - if(workload <= 278.7000f) - { - return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0); - } - else - { - if(r_mk <= 0.0004f) - { 
- return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0); - } - else - { - if(r_mk <= 0.0030f) - { - return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 8, 0, 1, 1, 0, 1); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0); - } - } - } - } - else - { - if(r_nk <= 1.9384f) - { - return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 8, 0, 1, 1, 0, 1); - } - } - } - else - { - if(r_nk <= 1.0368f) - { - return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, 0, 0, 1, 0, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0); - } - } - } - else - { - if(workload <= 1422.4000f) - { - if(workload <= 704.0000f) - { - return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 32, 0, 0, 1, 0, 0); - } - else - { - if(workload <= 1197.6000f) - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 8, 0, 1, 1, 0, 1); - } - else - { - if(workload <= 1241.6000f) - { - return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 8, 0, 1, 1, 0, 1); - } - } - } - } - else - { - if(workload <= 2769.6000f) - { - if(workload <= 1846.4000f) - { - if(r_mn <= 2.4927f) - { - return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0); - } - } - else - { - if(r_mn <= 0.6261f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0); - } - else - { - if(r_mk <= 3.4453f) - { - if(r_mn <= 1.4135f) - { - return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0); - } - } - else - { - return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0); - } - } - } - } - else - { - if(r_nk <= 0.0302f) - { - return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 8, 0, 1, 1, 0, 1); - } - else - { - if(r_mk <= 181.3750f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0); - } - else - { - if(workload <= 28035.2002f) - { - return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0); - } - else - { - if(r_mk <= 808.6667f) - { - return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0); - } - else - { - return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0); - } - } - } - } - } - } - } -} -} // namespace gemm -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h deleted file mode 100644 index 6a11ddb748..0000000000 --- a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. 
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_VALHALL_H
-#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_VALHALL_H
-
-#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-/** Valhall based OpenCL GEMMReshapedOnlyRHS configuration */
-class ClGemmDefaultConfigReshapedRhsOnlyValhall final : public IClGemmKernelConfig
-{
-public:
-    /** Constructor
-     *
-     * @param[in] gpu GPU target
-     */
-    ClGemmDefaultConfigReshapedRhsOnlyValhall(GPUTarget gpu);
-
-    // Inherited overridden method
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
-
-private:
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-};
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_VALHALL_H */
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h
deleted file mode 100644
index 8fd71276a0..0000000000
--- a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_GEMM_RESHAPED_ONLY_RHS_KERNEL_CONFIGURATION_H
-#define ARM_COMPUTE_CL_GEMM_RESHAPED_ONLY_RHS_KERNEL_CONFIGURATION_H
-
-#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
-#include "src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h"
-#include "src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace opencl
-{
-namespace kernels
-{
-namespace gemm
-{
-/** CLGEMMReshapedOnlyRHS factory class */
-class ClGemmReshapedOnlyRhsKernelConfigurationFactory final
-{
-public:
-    /** Static method to call the CLGEMMReshapedOnlyRHS kernel configuration class accordingly with the GPU target
-     *
-     * @param[in] gpu GPU target
-     *
-     * @return CLGEMMReshapedOnlyRHS kernel configuration class
-     */
-    static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
-    {
-        switch(get_arch_from_target(gpu))
-        {
-            case GPUTarget::MIDGARD:
-            case GPUTarget::BIFROST:
-                return std::make_unique<ClGemmDefaultConfigReshapedRhsOnlyBifrost>(gpu);
-            case GPUTarget::VALHALL:
-                return std::make_unique<ClGemmDefaultConfigReshapedRhsOnlyValhall>(gpu);
-            default:
-                ARM_COMPUTE_ERROR("Not supported GPU target");
-        }
-    }
-};
-} // namespace gemm
-} // namespace kernels
-} // namespace opencl
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_RESHAPED_ONLY_RHS_KERNEL_CONFIGURATION_H */
diff --git a/src/core/utils/AssemblyUtils.h b/src/core/utils/AssemblyUtils.h
index e682973827..b1aee64d5d 100644
--- a/src/core/utils/AssemblyUtils.h
+++ b/src/core/utils/AssemblyUtils.h
@@ -26,7 +26,7 @@
 #include "arm_compute/core/Types.h"
 #include "src/core/NEON/kernels/assembly/common.hpp"
-#include "src/core/cpu/kernels/assembly/arm_gemm.hpp"
+#include "src/cpu/kernels/assembly/arm_gemm.hpp"
 namespace arm_compute
 {
-- 
cgit v1.2.1
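
Note for reviewers: the kernel-configuration sources deleted above all share one dispatch pattern. `configure()` selects a per-data-type member-function pointer for the detected GPU target and forwards the GEMM shape to it, and each per-target heuristic then returns LHS/RHS block-size info. The standalone C++ sketch below illustrates only that pattern; the names (`ExampleConfigSelector`, `SimpleConfig`, the reduced `Target`/`Type` enums) and the toy heuristics are illustrative placeholders, not the library's actual API.

// Minimal sketch of per-target / per-data-type member-function dispatch,
// assuming simplified stand-in types rather than the real GEMM info structs.
#include <cstdio>
#include <stdexcept>

enum class Target { G76, Other };
enum class Type { F32, F16 };

// Stand-in for the pair of LHS/RHS reshape parameters returned by the real code.
struct SimpleConfig
{
    unsigned int m0; // rows processed per work-item
    unsigned int n0; // columns processed per work-item
};

class ExampleConfigSelector
{
public:
    explicit ExampleConfigSelector(Target target) : _target(target) {}

    // Mirrors configure(): pick the member function for (target, type), then call it.
    SimpleConfig configure(unsigned int m, unsigned int n, Type type) const
    {
        using Fn = SimpleConfig (ExampleConfigSelector::*)(unsigned int, unsigned int) const;
        Fn fn = nullptr;
        switch(_target)
        {
            case Target::G76:
                fn = (type == Type::F32) ? &ExampleConfigSelector::configure_G76_f32 : &ExampleConfigSelector::configure_G76_f16;
                break;
            default:
                fn = (type == Type::F32) ? &ExampleConfigSelector::configure_G7x_f32 : &ExampleConfigSelector::configure_G7x_f16;
                break;
        }
        if(fn == nullptr)
        {
            throw std::runtime_error("Data type not supported");
        }
        return (this->*fn)(m, n);
    }

private:
    // Toy heuristics: vector-by-matrix cases (m == 1) get narrow blocks, larger
    // workloads get square-ish blocks. The real heuristics branch on m, n, k, b.
    SimpleConfig configure_G76_f32(unsigned int m, unsigned int n) const { return (m == 1) ? SimpleConfig{1, (n < 2048) ? 2u : 4u} : SimpleConfig{4, 4}; }
    SimpleConfig configure_G76_f16(unsigned int m, unsigned int n) const { return (m == 1) ? SimpleConfig{1, (n < 2048) ? 2u : 4u} : SimpleConfig{5, 8}; }
    SimpleConfig configure_G7x_f32(unsigned int m, unsigned int n) const { return (m == 1) ? SimpleConfig{1, (n < 2048) ? 2u : 4u} : SimpleConfig{4, 4}; }
    SimpleConfig configure_G7x_f16(unsigned int m, unsigned int n) const { return (m == 1) ? SimpleConfig{1, (n < 2048) ? 2u : 4u} : SimpleConfig{4, 4}; }

    Target _target;
};

int main()
{
    ExampleConfigSelector selector(Target::G76);
    const SimpleConfig cfg = selector.configure(64, 128, Type::F32);
    std::printf("m0=%u n0=%u\n", cfg.m0, cfg.n0);
    return 0;
}

The member-function-pointer table keeps every per-target heuristic behind the same `configure()` entry point, which is why the factory in ClGemmReshapedOnlyRhsKernelConfig.h only has to choose between the Bifrost and Valhall classes.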