From 68dd25fbe6e4d3c3513fa5993863419769aa08fc Mon Sep 17 00:00:00 2001
From: Sang-Hoon Park <sang-hoon.park@arm.com>
Date: Mon, 19 Oct 2020 16:00:11 +0100
Subject: COMPMID-3637: Move utility headers from arm_compute to src

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: If9d6fa8c900b68c4b6fd373f2fc1f9abb83ea917
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4145
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Sang-Hoon Park <sang-hoon.park@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 src/core/AccessWindowAutoPadding.cpp               |   4 +-
 src/core/AccessWindowAutoPadding.h                 |  85 +++
 src/core/AccessWindowStatic.cpp                    |   4 +-
 src/core/AccessWindowStatic.h                      | 101 +++
 src/core/AccessWindowTranspose.cpp                 |   4 +-
 src/core/AccessWindowTranspose.h                   |  48 ++
 src/core/CL/CLValidate.h                           |  61 ++
 src/core/CL/ICLGEMMKernelConfiguration.h           |  68 ++
 src/core/CL/ICLKernel.cpp                          |   9 +-
 src/core/CL/ICLSimple2DKernel.cpp                  |   7 +-
 src/core/CL/ICLSimpleKernel.cpp                    |   3 +-
 src/core/CL/gemm/CLGEMMHelpers.cpp                 |   2 +-
 src/core/CL/gemm/CLGEMMHelpers.h                   |  73 ++
 .../gemm/native/CLGEMMNativeKernelConfiguration.h  |  65 ++
 .../CLGEMMNativeKernelConfigurationBifrost.cpp     |   4 +-
 .../CLGEMMNativeKernelConfigurationBifrost.h       |  56 ++
 .../CLGEMMNativeKernelConfigurationMidgard.cpp     |   4 +-
 .../CLGEMMNativeKernelConfigurationMidgard.h       |  51 ++
 .../CLGEMMNativeKernelConfigurationValhall.cpp     |   4 +-
 .../CLGEMMNativeKernelConfigurationValhall.h       |  53 ++
 .../reshaped/CLGEMMReshapedKernelConfiguration.h   |  63 ++
 .../CLGEMMReshapedKernelConfigurationBifrost.cpp   |  30 +-
 .../CLGEMMReshapedKernelConfigurationBifrost.h     |  56 ++
 .../CLGEMMReshapedKernelConfigurationValhall.cpp   |   4 +-
 .../CLGEMMReshapedKernelConfigurationValhall.h     |  53 ++
 .../CLGEMMReshapedOnlyRHSKernelConfiguration.h     |  63 ++
 ...MMReshapedOnlyRHSKernelConfigurationBifrost.cpp |  12 +-
 ...GEMMReshapedOnlyRHSKernelConfigurationBifrost.h |  59 ++
 ...MMReshapedOnlyRHSKernelConfigurationValhall.cpp |   4 +-
 ...GEMMReshapedOnlyRHSKernelConfigurationValhall.h |  53 ++
 src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp |   5 +-
 src/core/CL/kernels/CLActivationLayerKernel.cpp    |  10 +-
 src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp     |   7 +-
 .../CL/kernels/CLBatchConcatenateLayerKernel.cpp   |   6 +-
 .../CL/kernels/CLBatchNormalizationLayerKernel.cpp |   5 +-
 src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp  |   4 +-
 src/core/CL/kernels/CLBitwiseAndKernel.cpp         |   3 +-
 src/core/CL/kernels/CLBitwiseOrKernel.cpp          |   3 +-
 src/core/CL/kernels/CLBitwiseXorKernel.cpp         |   3 +-
 .../CL/kernels/CLBoundingBoxTransformKernel.cpp    |   7 +-
 src/core/CL/kernels/CLBox3x3Kernel.cpp             |   3 +-
 src/core/CL/kernels/CLCannyEdgeKernel.cpp          |   1 +
 src/core/CL/kernels/CLChannelCombineKernel.cpp     |   4 +-
 src/core/CL/kernels/CLChannelExtractKernel.cpp     |   5 +-
 .../CL/kernels/CLChannelShuffleLayerKernel.cpp     |   6 +-
 src/core/CL/kernels/CLCol2ImKernel.cpp             |   6 +-
 src/core/CL/kernels/CLColorConvertKernel.cpp       |   4 +-
 src/core/CL/kernels/CLComparisonKernel.cpp         |   4 +-
 .../CLConvertFullyConnectedWeightsKernel.cpp       |   5 +-
 src/core/CL/kernels/CLConvolutionKernel.cpp        |   2 +-
 src/core/CL/kernels/CLCopyKernel.cpp               |   6 +-
 src/core/CL/kernels/CLCropKernel.cpp               |   9 +-
 .../kernels/CLDeconvolutionLayerUpsampleKernel.cpp |   5 +-
 .../kernels/CLDeconvolutionReshapeOutputKernel.cpp |   2 +
 .../CL/kernels/CLDepthConcatenateLayerKernel.cpp   |   6 +-
 src/core/CL/kernels/CLDepthConvertLayerKernel.cpp  |   4 +-
 src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp  |   4 +-
 .../CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp   |   8 +-
 .../CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp   |   8 +-
 .../CLDepthwiseConvolutionLayerNativeKernel.cpp    |   7 +-
 ...pthwiseConvolutionLayerReshapeWeightsKernel.cpp |   9 +-
 .../CL/kernels/CLDequantizationLayerKernel.cpp     |   7 +-
 src/core/CL/kernels/CLDerivativeKernel.cpp         |   4 +-
 src/core/CL/kernels/CLDilateKernel.cpp             |   2 +-
 .../CL/kernels/CLDirectConvolutionLayerKernel.cpp  |   9 +-
 .../CL/kernels/CLElementWiseUnaryLayerKernel.cpp   |   5 +-
 .../CL/kernels/CLElementwiseOperationKernel.cpp    |   6 +-
 src/core/CL/kernels/CLErodeKernel.cpp              |   2 +-
 src/core/CL/kernels/CLFFTDigitReverseKernel.cpp    |   6 +-
 src/core/CL/kernels/CLFFTRadixStageKernel.cpp      |   6 +-
 src/core/CL/kernels/CLFFTScaleKernel.cpp           |   6 +-
 src/core/CL/kernels/CLFastCornersKernel.cpp        |   4 +-
 src/core/CL/kernels/CLFillBorderKernel.cpp         |   6 +-
 src/core/CL/kernels/CLFlattenLayerKernel.cpp       |   3 +-
 src/core/CL/kernels/CLFloorKernel.cpp              |   6 +-
 .../CL/kernels/CLFuseBatchNormalizationKernel.cpp  |   5 +-
 .../CLGEMMLowpMatrixMultiplyNativeKernel.cpp       |   9 +-
 .../CLGEMMLowpMatrixMultiplyReshapedKernel.cpp     |   7 +-
 ...GEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp |   7 +-
 .../kernels/CLGEMMLowpOffsetContributionKernel.cpp |   9 +-
 ...GEMMLowpOffsetContributionOutputStageKernel.cpp |   7 +-
 ...owpQuantizeDownInt32ScaleByFixedPointKernel.cpp |   6 +-
 ...GEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp |   6 +-
 .../CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp     |   6 +-
 src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp  |   4 +-
 src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp |  14 +-
 .../kernels/CLGEMMMatrixMultiplyNativeKernel.cpp   |   9 +-
 .../kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp |  13 +-
 .../CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp  |  11 +-
 .../kernels/CLGEMMMatrixVectorMultiplyKernel.cpp   |   7 +-
 .../CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp    |   9 +-
 .../CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp    |  11 +-
 src/core/CL/kernels/CLGatherKernel.cpp             |   3 +-
 src/core/CL/kernels/CLGaussian3x3Kernel.cpp        |   2 +-
 src/core/CL/kernels/CLGaussianPyramidKernel.cpp    |   1 +
 .../CL/kernels/CLGenerateProposalsLayerKernel.cpp  |   7 +-
 src/core/CL/kernels/CLHOGDescriptorKernel.cpp      |   4 +-
 src/core/CL/kernels/CLHOGDetectorKernel.cpp        |   4 +-
 src/core/CL/kernels/CLHarrisCornersKernel.cpp      |   7 +-
 .../CL/kernels/CLHeightConcatenateLayerKernel.cpp  |   6 +-
 src/core/CL/kernels/CLHistogramKernel.cpp          |   4 +-
 src/core/CL/kernels/CLIm2ColKernel.cpp             |   8 +-
 .../kernels/CLInstanceNormalizationLayerKernel.cpp |   5 +-
 src/core/CL/kernels/CLIntegralImageKernel.cpp      |   3 +-
 src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp   |   5 +-
 src/core/CL/kernels/CLLKTrackerKernel.cpp          |   5 +-
 .../CLLocallyConnectedMatrixMultiplyKernel.cpp     |   8 +-
 src/core/CL/kernels/CLMagnitudePhaseKernel.cpp     |   3 +-
 src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp  |   9 +-
 src/core/CL/kernels/CLMeanStdDevKernel.cpp         |   6 +-
 .../CL/kernels/CLMeanStdDevNormalizationKernel.cpp |   7 +-
 src/core/CL/kernels/CLMedian3x3Kernel.cpp          |   2 +-
 src/core/CL/kernels/CLMemsetKernel.cpp             |   2 +-
 src/core/CL/kernels/CLMinMaxLayerKernel.cpp        |   5 +-
 src/core/CL/kernels/CLMinMaxLocationKernel.cpp     |   3 +-
 src/core/CL/kernels/CLNonLinearFilterKernel.cpp    |   3 +-
 .../CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp |   1 +
 src/core/CL/kernels/CLNormalizationLayerKernel.cpp |   7 +-
 .../CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp |   7 +-
 src/core/CL/kernels/CLPadLayerKernel.cpp           |   2 +
 src/core/CL/kernels/CLPermuteKernel.cpp            |   3 +-
 .../CL/kernels/CLPixelWiseMultiplicationKernel.cpp |   6 +-
 src/core/CL/kernels/CLPoolingLayerKernel.cpp       |   7 +-
 src/core/CL/kernels/CLPriorBoxLayerKernel.cpp      |   5 +-
 .../CL/kernels/CLQLSTMLayerNormalizationKernel.cpp |   3 +-
 src/core/CL/kernels/CLQuantizationLayerKernel.cpp  |   6 +-
 src/core/CL/kernels/CLROIAlignLayerKernel.cpp      |   7 +-
 src/core/CL/kernels/CLROIPoolingLayerKernel.cpp    |   7 +-
 src/core/CL/kernels/CLRangeKernel.cpp              |   4 +-
 src/core/CL/kernels/CLReductionOperationKernel.cpp |   7 +-
 src/core/CL/kernels/CLRemapKernel.cpp              |   5 +-
 src/core/CL/kernels/CLReorgLayerKernel.cpp         |   4 +-
 src/core/CL/kernels/CLReshapeLayerKernel.cpp       |   8 +-
 src/core/CL/kernels/CLReverseKernel.cpp            |   6 +-
 src/core/CL/kernels/CLScaleKernel.cpp              |   5 +-
 src/core/CL/kernels/CLScharr3x3Kernel.cpp          |   4 +-
 src/core/CL/kernels/CLSelectKernel.cpp             |   5 +-
 src/core/CL/kernels/CLSobel3x3Kernel.cpp           |   4 +-
 src/core/CL/kernels/CLSobel5x5Kernel.cpp           |   4 +-
 src/core/CL/kernels/CLSobel7x7Kernel.cpp           |   4 +-
 src/core/CL/kernels/CLSoftmaxLayerKernel.cpp       |   7 +-
 src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp  |   4 +-
 src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp  |   4 +-
 src/core/CL/kernels/CLStackLayerKernel.cpp         |   5 +-
 src/core/CL/kernels/CLStridedSliceKernel.cpp       |   6 +-
 src/core/CL/kernels/CLTileKernel.cpp               |   3 +-
 src/core/CL/kernels/CLTransposeKernel.cpp          |  10 +-
 src/core/CL/kernels/CLUpsampleLayerKernel.cpp      |   6 +-
 src/core/CL/kernels/CLWarpAffineKernel.cpp         |   3 +-
 src/core/CL/kernels/CLWarpPerspectiveKernel.cpp    |   3 +-
 src/core/CL/kernels/CLWeightsReshapeKernel.cpp     |   2 +
 .../kernels/CLWidthConcatenate2TensorsKernel.cpp   |  10 +-
 .../kernels/CLWidthConcatenate4TensorsKernel.cpp   |  11 +-
 .../CL/kernels/CLWidthConcatenateLayerKernel.cpp   |   7 +-
 .../CL/kernels/CLWinogradFilterTransformKernel.cpp |   6 +-
 .../CL/kernels/CLWinogradInputTransformKernel.cpp  |   6 +-
 .../CL/kernels/CLWinogradOutputTransformKernel.cpp |   6 +-
 src/core/CL/kernels/CLYOLOLayerKernel.cpp          |   8 +-
 src/core/CPP/ICPPSimpleKernel.cpp                  |   5 +-
 src/core/CPP/Validate.h                            | 117 ++++
 .../CPPBoxWithNonMaximaSuppressionLimitKernel.cpp  |   4 +-
 src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp |  12 +-
 ...PPDetectionWindowNonMaximaSuppressionKernel.cpp |   3 +-
 .../CPP/kernels/CPPNonMaximumSuppressionKernel.cpp |   4 +-
 src/core/CPP/kernels/CPPPermuteKernel.cpp          |   7 +-
 src/core/CPP/kernels/CPPTopKVKernel.cpp            |  10 +-
 src/core/CPP/kernels/CPPUpsampleKernel.cpp         |   7 +-
 src/core/GLES_COMPUTE/IGCSimpleKernel.cpp          |   3 +-
 .../kernels/GCAbsoluteDifferenceKernel.cpp         |   1 +
 .../kernels/GCActivationLayerKernel.cpp            |   2 +
 .../kernels/GCArithmeticAdditionKernel.cpp         |   2 +
 .../kernels/GCBatchNormalizationLayerKernel.cpp    |   4 +-
 src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp   |   6 +-
 .../kernels/GCDepthConcatenateLayerKernel.cpp      |   2 +
 .../GCDepthwiseConvolutionLayer3x3Kernel.cpp       |   4 +-
 .../kernels/GCDirectConvolutionLayerKernel.cpp     |  22 +-
 .../GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp  |   2 +
 .../GLES_COMPUTE/kernels/GCFillBorderKernel.cpp    |   2 +
 .../kernels/GCGEMMInterleave4x4Kernel.cpp          |   2 +
 .../kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp |   4 +-
 .../kernels/GCGEMMMatrixAdditionKernel.cpp         |   2 +
 .../kernels/GCGEMMMatrixMultiplyKernel.cpp         |   6 +-
 .../kernels/GCGEMMTranspose1xWKernel.cpp           |   4 +-
 src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp   |  12 +-
 .../kernels/GCNormalizationLayerKernel.cpp         |   2 +
 .../kernels/GCNormalizePlanarYUVLayerKernel.cpp    |   4 +-
 .../kernels/GCPixelWiseMultiplicationKernel.cpp    |   2 +
 .../GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp  |   4 +-
 src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp    |   4 +-
 .../GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp  |   4 +-
 .../GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp   |   4 +-
 .../GLES_COMPUTE/kernels/GCTransposeKernel.cpp     |   4 +-
 .../kernels/GCWeightsReshapeKernel.cpp             |   2 +
 src/core/Helpers.cpp                               | 171 -----
 src/core/NEON/NETracePoint.cpp                     |   4 +-
 .../NEON/kernels/NEAbsoluteDifferenceKernel.cpp    |   2 +
 src/core/NEON/kernels/NEAccumulateKernel.cpp       |   4 +-
 src/core/NEON/kernels/NEActivationLayerKernel.cpp  |   4 +-
 .../NEON/kernels/NEArithmeticAdditionKernel.cpp    |   4 +-
 .../NEON/kernels/NEArithmeticSubtractionKernel.cpp |   4 +-
 .../NEON/kernels/NEBatchConcatenateLayerKernel.cpp |   2 +
 .../kernels/NEBatchNormalizationLayerKernel.cpp    |   4 +-
 .../NEON/kernels/NEBatchToSpaceLayerKernel.cpp     |   2 +
 src/core/NEON/kernels/NEBitwiseAndKernel.cpp       |   2 +
 src/core/NEON/kernels/NEBitwiseNotKernel.cpp       |   4 +-
 src/core/NEON/kernels/NEBitwiseOrKernel.cpp        |   4 +-
 src/core/NEON/kernels/NEBitwiseXorKernel.cpp       |   4 +-
 .../NEON/kernels/NEBoundingBoxTransformKernel.cpp  |   6 +-
 src/core/NEON/kernels/NEBox3x3Kernel.cpp           |   5 +-
 src/core/NEON/kernels/NECannyEdgeKernel.cpp        |   8 +-
 src/core/NEON/kernels/NEChannelCombineKernel.cpp   |   4 +-
 src/core/NEON/kernels/NEChannelExtractKernel.cpp   |   4 +-
 .../NEON/kernels/NEChannelShuffleLayerKernel.cpp   |   6 +-
 src/core/NEON/kernels/NECol2ImKernel.cpp           |   4 +-
 src/core/NEON/kernels/NEColorConvertKernel.cpp     |   2 +
 .../NEConvertFullyConnectedWeightsKernel.cpp       |   4 +-
 .../kernels/NEConvertQuantizedSignednessKernel.cpp |   2 +
 src/core/NEON/kernels/NEConvolutionKernel.cpp      |   4 +-
 src/core/NEON/kernels/NECopyKernel.cpp             |   4 +-
 src/core/NEON/kernels/NECropKernel.cpp             |   9 +-
 .../kernels/NECumulativeDistributionKernel.cpp     |   4 +-
 .../NEON/kernels/NEDepthConcatenateLayerKernel.cpp |   2 +
 .../NEON/kernels/NEDepthConvertLayerKernel.cpp     |   7 +-
 .../NEON/kernels/NEDepthToSpaceLayerKernel.cpp     |   3 +
 .../NEDepthwiseConvolutionLayerNativeKernel.cpp    |   7 +-
 .../NEON/kernels/NEDequantizationLayerKernel.cpp   |   6 +-
 src/core/NEON/kernels/NEDerivativeKernel.cpp       |   4 +-
 src/core/NEON/kernels/NEDilateKernel.cpp           |   2 +
 .../kernels/NEDirectConvolutionLayerKernel.cpp     |   6 +-
 .../NEDirectConvolutionLayerOutputStageKernel.cpp  |   6 +-
 .../NEON/kernels/NEElementwiseOperationKernel.cpp  |   4 +-
 src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp |   4 +-
 src/core/NEON/kernels/NEErodeKernel.cpp            |   2 +
 src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp  |   4 +-
 src/core/NEON/kernels/NEFFTRadixStageKernel.cpp    |   7 +-
 src/core/NEON/kernels/NEFFTScaleKernel.cpp         |   2 +
 src/core/NEON/kernels/NEFastCornersKernel.cpp      |   4 +-
 src/core/NEON/kernels/NEFillArrayKernel.cpp        |   1 +
 src/core/NEON/kernels/NEFillBorderKernel.cpp       |   3 +-
 src/core/NEON/kernels/NEFlattenLayerKernel.cpp     |   4 +-
 src/core/NEON/kernels/NEFloorKernel.cpp            |   4 +-
 .../kernels/NEFuseBatchNormalizationKernel.cpp     |   4 +-
 .../NEON/kernels/NEGEMMInterleave4x4Kernel.cpp     |   2 +
 .../kernels/NEGEMMLowpMatrixMultiplyKernel.cpp     |   6 +-
 .../kernels/NEGEMMLowpOffsetContributionKernel.cpp |   4 +-
 ...GEMMLowpOffsetContributionOutputStageKernel.cpp |   4 +-
 .../NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp     |   4 +-
 ...tizeDownInt32ToInt16ScaleByFixedPointKernel.cpp |   4 +-
 ...ntizeDownInt32ToInt8ScaleByFixedPointKernel.cpp |   4 +-
 ...tizeDownInt32ToUint8ScaleByFixedPointKernel.cpp |   4 +-
 .../NEON/kernels/NEGEMMLowpReductionKernel.cpp     |   4 +-
 .../NEON/kernels/NEGEMMMatrixAdditionKernel.cpp    |   4 +-
 .../NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp    |   7 +-
 src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp |   4 +-
 src/core/NEON/kernels/NEGatherKernel.cpp           |   6 +-
 src/core/NEON/kernels/NEGaussian3x3Kernel.cpp      |   4 +-
 src/core/NEON/kernels/NEGaussian5x5Kernel.cpp      |   4 +-
 src/core/NEON/kernels/NEGaussianPyramidKernel.cpp  |   4 +-
 .../kernels/NEGenerateProposalsLayerKernel.cpp     |   8 +-
 src/core/NEON/kernels/NEHOGDescriptorKernel.cpp    |   4 +-
 src/core/NEON/kernels/NEHOGDetectorKernel.cpp      |   4 +-
 src/core/NEON/kernels/NEHarrisCornersKernel.cpp    |   4 +-
 .../kernels/NEHeightConcatenateLayerKernel.cpp     |   2 +
 src/core/NEON/kernels/NEHistogramKernel.cpp        |   4 +-
 src/core/NEON/kernels/NEIm2ColKernel.cpp           |   4 +-
 .../kernels/NEInstanceNormalizationLayerKernel.cpp |   4 +-
 src/core/NEON/kernels/NEIntegralImageKernel.cpp    |   4 +-
 src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp |   2 +
 src/core/NEON/kernels/NELKTrackerKernel.cpp        |   6 +-
 .../NELocallyConnectedMatrixMultiplyKernel.cpp     |   4 +-
 src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp   |   4 +-
 .../NEON/kernels/NEMaxUnpoolingLayerKernel.cpp     |   4 +-
 src/core/NEON/kernels/NEMeanStdDevKernel.cpp       |   4 +-
 .../kernels/NEMeanStdDevNormalizationKernel.cpp    |   4 +-
 src/core/NEON/kernels/NEMedian3x3Kernel.cpp        |   4 +-
 src/core/NEON/kernels/NEMemsetKernel.cpp           |   6 +-
 src/core/NEON/kernels/NEMinMaxLayerKernel.cpp      |   4 +-
 src/core/NEON/kernels/NEMinMaxLocationKernel.cpp   |   4 +-
 src/core/NEON/kernels/NENonLinearFilterKernel.cpp  |   4 +-
 .../kernels/NENonMaximaSuppression3x3Kernel.cpp    |   4 +-
 .../NEON/kernels/NENormalizationLayerKernel.cpp    |   7 +-
 src/core/NEON/kernels/NEPadLayerKernel.cpp         |   2 +
 src/core/NEON/kernels/NEPermuteKernel.cpp          |   4 +-
 .../kernels/NEPixelWiseMultiplicationKernel.cpp    |   4 +-
 src/core/NEON/kernels/NEPoolingLayerKernel.cpp     |   6 +-
 src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp    |   2 +
 .../kernels/NEQLSTMLayerNormalizationKernel.cpp    |   4 +-
 .../NEON/kernels/NEQuantizationLayerKernel.cpp     |   4 +-
 src/core/NEON/kernels/NEROIAlignLayerKernel.cpp    |   6 +-
 src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp  |   4 +-
 src/core/NEON/kernels/NERangeKernel.cpp            |   2 +
 .../NEON/kernels/NEReductionOperationKernel.cpp    |   6 +-
 src/core/NEON/kernels/NERemapKernel.cpp            |   9 +-
 src/core/NEON/kernels/NEReorgLayerKernel.cpp       |   4 +-
 src/core/NEON/kernels/NEReshapeLayerKernel.cpp     |   6 +-
 src/core/NEON/kernels/NEReverseKernel.cpp          |   2 +
 src/core/NEON/kernels/NEScaleKernel.cpp            |  12 +-
 src/core/NEON/kernels/NEScharr3x3Kernel.cpp        |   4 +-
 src/core/NEON/kernels/NESelectKernel.cpp           |   4 +-
 src/core/NEON/kernels/NESobel3x3Kernel.cpp         |   4 +-
 src/core/NEON/kernels/NESobel5x5Kernel.cpp         |   4 +-
 src/core/NEON/kernels/NESobel7x7Kernel.cpp         |   4 +-
 src/core/NEON/kernels/NESoftmaxLayerKernel.cpp     |   8 +-
 .../NEON/kernels/NESpaceToBatchLayerKernel.cpp     |   3 +
 .../NEON/kernels/NESpaceToDepthLayerKernel.cpp     |   3 +
 src/core/NEON/kernels/NEStackLayerKernel.cpp       |   2 +
 src/core/NEON/kernels/NEStridedSliceKernel.cpp     |   9 +-
 src/core/NEON/kernels/NEThresholdKernel.cpp        |   2 +
 src/core/NEON/kernels/NETileKernel.cpp             |   4 +-
 src/core/NEON/kernels/NETransposeKernel.cpp        |   6 +-
 src/core/NEON/kernels/NEUpsampleLayerKernel.cpp    |   4 +-
 src/core/NEON/kernels/NEWarpKernel.cpp             |  19 +-
 src/core/NEON/kernels/NEWeightsReshapeKernel.cpp   |   2 +
 .../NEON/kernels/NEWidthConcatenateLayerKernel.cpp |   2 +
 .../kernels/NEWinogradConvolutionLayerKernel.cpp   |   6 +-
 .../kernels/NEWinogradConvolutionLayerKernel.h     |   4 +-
 src/core/NEON/kernels/NEYOLOLayerKernel.cpp        |   4 +-
 .../NEON/kernels/assembly/INEGEMMWrapperKernel.cpp |   4 +-
 .../NEON/kernels/assembly/INEGEMMWrapperKernel.h   | 108 +++
 .../NEDepthwiseConvolutionAssemblyKernelWrapper.h  |  88 +++
 src/core/NEON/kernels/assembly/arm_gemm_local.hpp  |  34 +
 .../NEON/kernels/convolution/common/activation.hpp |  37 +
 src/core/NEON/kernels/convolution/common/alloc.hpp |  31 +
 src/core/NEON/kernels/convolution/common/arm.hpp   |  39 ++
 .../kernels/convolution/common/convolution.hpp     |  29 +
 .../NEON/kernels/convolution/common/padding.hpp    |  91 +++
 src/core/NEON/kernels/convolution/common/perf.h    |  32 +
 .../NEON/kernels/convolution/common/qasymm8.hpp    |  54 ++
 .../NEON/kernels/convolution/common/qsymm8.hpp     |  76 +++
 src/core/NEON/kernels/convolution/common/shims.hpp | 749 +++++++++++++++++++++
 .../NEON/kernels/convolution/common/tensor.hpp     | 178 +++++
 .../kernels/convolution/common/tensor_utils.hpp    |  46 ++
 src/core/NEON/kernels/convolution/common/utils.hpp |  60 ++
 .../kernels/convolution/depthwise/depthwise.hpp    | 551 +++++++++++++++
 .../convolution/depthwise/depthwise_dilated.hpp    | 156 +++++
 .../convolution/depthwise/depthwise_quantized.hpp  | 291 ++++++++
 .../depthwise/depthwise_quantized_dilated.hpp      |  88 +++
 .../kernels/detail/NEDirectConvolutionDetail.h     |   4 +-
 src/core/TensorInfo.cpp                            |   1 +
 src/core/helpers/AutoConfiguration.h               | 176 +++++
 src/core/helpers/NormalizationHelpers.h            |  47 ++
 src/core/helpers/ScaleHelpers.h                    | 331 +++++++++
 src/core/helpers/SoftmaxHelpers.cpp                |  45 ++
 src/core/helpers/SoftmaxHelpers.h                  |  50 ++
 src/core/helpers/Utils.h                           |  97 +++
 src/core/helpers/WindowHelpers.cpp                 | 183 +++++
 src/core/helpers/WindowHelpers.h                   | 172 +++++
 src/core/utils/helpers/bit_ops.h                   |  52 ++
 src/core/utils/helpers/fft.cpp                     |   4 +-
 src/core/utils/helpers/fft.h                       |  55 ++
 src/core/utils/helpers/float_ops.h                 | 116 ++++
 src/core/utils/helpers/tensor_info.h               |  57 ++
 src/core/utils/helpers/tensor_transform.cpp        |   4 +-
 src/graph/algorithms/TopologicalSort.cpp           |   6 +-
 src/graph/backends/CL/CLFunctionsFactory.cpp       |   2 +-
 src/graph/backends/CL/CLNodeValidator.cpp          |   2 +-
 src/graph/backends/CL/CLSubTensorHandle.cpp        |   4 +-
 src/graph/backends/GLES/GCFunctionsFactory.cpp     |   2 +-
 src/graph/backends/GLES/GCNodeValidator.cpp        |   4 +-
 src/graph/backends/NEON/NEFunctionFactory.cpp      |   2 +-
 src/graph/backends/NEON/NENodeValidator.cpp        |   2 +-
 src/graph/backends/NEON/NETensorHandle.cpp         |   4 +-
 .../detail/CrossLayerMemoryManagerHelpers.cpp      |   2 +-
 src/graph/mutators/DepthConcatSubTensorMutator.cpp |   6 +-
 src/graph/mutators/GroupedConvolutionMutator.cpp   |   2 +-
 src/graph/mutators/NodeExecutionMethodMutator.cpp  |   4 +-
 src/graph/mutators/NodeFusionMutator.cpp           |  13 +-
 src/graph/mutators/SplitLayerSubTensorMutator.cpp  |   4 +-
 src/graph/mutators/SyntheticDataTypeMutator.cpp    |   2 +-
 src/runtime/CL/CLHelpers.cpp                       |   3 +-
 src/runtime/CL/CLMemory.cpp                        |   4 +-
 src/runtime/CL/CLRuntimeContext.cpp                |   2 +
 src/runtime/CL/CLTensorAllocator.cpp               |   4 +-
 src/runtime/CL/functions/CLArgMinMaxLayer.cpp      |  11 +-
 src/runtime/CL/functions/CLConcatenateLayer.cpp    |   1 +
 .../functions/CLConvertFullyConnectedWeights.cpp   |   2 +
 src/runtime/CL/functions/CLConvolutionLayer.cpp    |   2 +
 src/runtime/CL/functions/CLCropResize.cpp          |   4 +
 src/runtime/CL/functions/CLDeconvolutionLayer.cpp  |   2 +
 .../CL/functions/CLDirectConvolutionLayer.cpp      |   2 +-
 .../CL/functions/CLDirectDeconvolutionLayer.cpp    |   1 +
 src/runtime/CL/functions/CLFFT1D.cpp               |   2 +-
 src/runtime/CL/functions/CLFFTConvolutionLayer.cpp |   5 +-
 src/runtime/CL/functions/CLFill.cpp                |   2 +
 src/runtime/CL/functions/CLFullyConnectedLayer.cpp |   2 +-
 src/runtime/CL/functions/CLGEMM.cpp                |  15 +-
 .../CL/functions/CLGEMMConvolutionLayer.cpp        |   3 +-
 .../CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp  |   7 +-
 .../CL/functions/CLGenerateProposalsLayer.cpp      |   1 +
 .../CL/functions/CLInstanceNormalizationLayer.cpp  |   2 +
 src/runtime/CL/functions/CLLSTMLayerQuantized.cpp  |   1 +
 src/runtime/CL/functions/CLPriorBoxLayer.cpp       |   2 +
 src/runtime/CL/functions/CLQLSTMLayer.cpp          |   1 +
 src/runtime/CL/functions/CLReduceMean.cpp          |   3 +-
 src/runtime/CL/functions/CLReductionOperation.cpp  |   8 +-
 src/runtime/CL/functions/CLRemap.cpp               |   2 +-
 src/runtime/CL/functions/CLSelect.cpp              |   2 +
 src/runtime/CL/functions/CLSoftmaxLayer.cpp        |   7 +-
 src/runtime/CL/functions/CLSplit.cpp               |   1 +
 .../CL/functions/CLWinogradConvolutionLayer.cpp    |   2 +-
 src/runtime/CL/gemm/CLGEMMKernelSelection.h        |  65 ++
 .../CL/gemm/CLGEMMKernelSelectionBifrost.cpp       |   4 +-
 src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h |  55 ++
 .../CL/gemm/CLGEMMKernelSelectionMidgard.cpp       |   4 +-
 src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h |  53 ++
 .../CL/gemm/CLGEMMKernelSelectionValhall.cpp       |   4 +-
 src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h |  53 ++
 src/runtime/CL/tuners/BifrostTuner.cpp             |   2 +-
 src/runtime/CL/tuners/MidgardTuner.cpp             |   2 +-
 src/runtime/CPP/CPPScheduler.cpp                   |   3 +-
 .../CPP/functions/CPPDetectionOutputLayer.cpp      |   1 +
 .../CPP/functions/CPPDetectionPostProcessLayer.cpp |   1 +
 src/runtime/CPUUtils.cpp                           |   9 +-
 src/runtime/CPUUtils.h                             |  51 ++
 src/runtime/DeviceProperties.cpp                   |   6 +-
 src/runtime/GLES_COMPUTE/GCMemory.cpp              |   4 +-
 .../GLES_COMPUTE/functions/GCConcatenateLayer.cpp  |   2 +
 src/runtime/IScheduler.cpp                         |  10 +-
 src/runtime/NEON/INESimpleFunctionNoBorder.cpp     |   6 +-
 src/runtime/NEON/functions/NEArgMinMaxLayer.cpp    |   4 +-
 .../NEON/functions/NEBatchNormalizationLayer.cpp   |   4 +-
 src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp |   4 +-
 src/runtime/NEON/functions/NEConcatenateLayer.cpp  |   1 +
 src/runtime/NEON/functions/NECropResize.cpp        |   2 +
 .../NEON/functions/NEDeconvolutionLayer.cpp        |   1 +
 src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp |   4 +-
 src/runtime/NEON/functions/NEFFT1D.cpp             |   4 +-
 .../NEON/functions/NEFFTConvolutionLayer.cpp       |   7 +-
 .../NEON/functions/NEFullyConnectedLayer.cpp       |   2 +
 src/runtime/NEON/functions/NEGEMM.cpp              |   3 +-
 .../NEON/functions/NEGEMMAssemblyDispatch.cpp      |  10 +-
 .../functions/NEGEMMLowpMatrixMultiplyCore.cpp     |   1 +
 .../NEON/functions/NEGenerateProposalsLayer.cpp    |   1 +
 .../NEON/functions/NELSTMLayerQuantized.cpp        |   3 +-
 src/runtime/NEON/functions/NEPadLayer.cpp          |   1 +
 src/runtime/NEON/functions/NEPriorBoxLayer.cpp     |   4 +-
 src/runtime/NEON/functions/NEQLSTMLayer.cpp        |   1 +
 src/runtime/NEON/functions/NEReduceMean.cpp        |   3 +-
 .../NEON/functions/NEReductionOperation.cpp        |   1 +
 src/runtime/NEON/functions/NEScale.cpp             |   4 +-
 .../NEON/functions/NESimpleAssemblyFunction.cpp    |   4 +-
 .../NEON/functions/NESimpleAssemblyFunction.h      |  56 ++
 src/runtime/NEON/functions/NESoftmaxLayer.cpp      |   7 +-
 .../NEON/functions/NEWinogradConvolutionLayer.cpp  |   4 +-
 .../NEDepthwiseConvolutionAssemblyDispatch.cpp     |  11 +-
 src/runtime/OMP/OMPScheduler.cpp                   |   2 +-
 src/runtime/SchedulerUtils.cpp                     |   4 +
 src/runtime/SchedulerUtils.h                       |  45 ++
 src/runtime/Utils.cpp                              |   7 +-
 src/runtime/Utils.h                                |  60 ++
 450 files changed, 6798 insertions(+), 910 deletions(-)
 create mode 100644 src/core/AccessWindowAutoPadding.h
 create mode 100644 src/core/AccessWindowStatic.h
 create mode 100644 src/core/AccessWindowTranspose.h
 create mode 100644 src/core/CL/CLValidate.h
 create mode 100644 src/core/CL/ICLGEMMKernelConfiguration.h
 create mode 100644 src/core/CL/gemm/CLGEMMHelpers.h
 create mode 100644 src/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h
 create mode 100644 src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h
 create mode 100644 src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h
 create mode 100644 src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h
 create mode 100644 src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h
 create mode 100644 src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h
 create mode 100644 src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h
 create mode 100644 src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h
 create mode 100644 src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h
 create mode 100644 src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h
 create mode 100644 src/core/CPP/Validate.h
 create mode 100644 src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h
 create mode 100644 src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
 create mode 100644 src/core/NEON/kernels/assembly/arm_gemm_local.hpp
 create mode 100644 src/core/NEON/kernels/convolution/common/activation.hpp
 create mode 100644 src/core/NEON/kernels/convolution/common/alloc.hpp
 create mode 100644 src/core/NEON/kernels/convolution/common/arm.hpp
 create mode 100644 src/core/NEON/kernels/convolution/common/convolution.hpp
 create mode 100644 src/core/NEON/kernels/convolution/common/padding.hpp
 create mode 100644 src/core/NEON/kernels/convolution/common/perf.h
 create mode 100644 src/core/NEON/kernels/convolution/common/qasymm8.hpp
 create mode 100644 src/core/NEON/kernels/convolution/common/qsymm8.hpp
 create mode 100644 src/core/NEON/kernels/convolution/common/shims.hpp
 create mode 100644 src/core/NEON/kernels/convolution/common/tensor.hpp
 create mode 100644 src/core/NEON/kernels/convolution/common/tensor_utils.hpp
 create mode 100644 src/core/NEON/kernels/convolution/common/utils.hpp
 create mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise.hpp
 create mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp
 create mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
 create mode 100644 src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp
 create mode 100644 src/core/helpers/AutoConfiguration.h
 create mode 100644 src/core/helpers/NormalizationHelpers.h
 create mode 100644 src/core/helpers/ScaleHelpers.h
 create mode 100644 src/core/helpers/SoftmaxHelpers.cpp
 create mode 100644 src/core/helpers/SoftmaxHelpers.h
 create mode 100644 src/core/helpers/Utils.h
 create mode 100644 src/core/helpers/WindowHelpers.cpp
 create mode 100644 src/core/helpers/WindowHelpers.h
 create mode 100644 src/core/utils/helpers/bit_ops.h
 create mode 100644 src/core/utils/helpers/fft.h
 create mode 100644 src/core/utils/helpers/float_ops.h
 create mode 100644 src/core/utils/helpers/tensor_info.h
 create mode 100644 src/runtime/CL/gemm/CLGEMMKernelSelection.h
 create mode 100644 src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h
 create mode 100644 src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h
 create mode 100644 src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h
 create mode 100644 src/runtime/CPUUtils.h
 create mode 100644 src/runtime/NEON/functions/NESimpleAssemblyFunction.h
 create mode 100644 src/runtime/SchedulerUtils.h
 create mode 100644 src/runtime/Utils.h

(limited to 'src')

diff --git a/src/core/AccessWindowAutoPadding.cpp b/src/core/AccessWindowAutoPadding.cpp
index 85c5b27d82..ca2f7d238f 100644
--- a/src/core/AccessWindowAutoPadding.cpp
+++ b/src/core/AccessWindowAutoPadding.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/AccessWindowAutoPadding.h"
+#include "src/core/AccessWindowAutoPadding.h"
 
 #include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/core/Window.h"
diff --git a/src/core/AccessWindowAutoPadding.h b/src/core/AccessWindowAutoPadding.h
new file mode 100644
index 0000000000..b8d1508679
--- /dev/null
+++ b/src/core/AccessWindowAutoPadding.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ACCESS_WINDOW_AUTO_PADDING_H
+#define ARM_COMPUTE_ACCESS_WINDOW_AUTO_PADDING_H
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class Window;
+class ITensorInfo;
+
+/** Dummy access window.
+ *
+ * This implementation always uses the auto padding of the tensor info and
+ * never updates the window. The valid region is always set to cover the entire
+ * tensor.
+ *
+ * @note This access window is only used during the migration to the new
+ *       padding system. It will be removed once all kernels have been ported.
+ *
+ * */
+class AccessWindowAutoPadding : public IAccessWindow
+{
+public:
+    /** Default constructor.
+     *
+     * @param[in,out] info Tensor info of the accessed kernel.
+     */
+    AccessWindowAutoPadding(ITensorInfo *info);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    AccessWindowAutoPadding(const AccessWindowAutoPadding &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    AccessWindowAutoPadding &operator=(const AccessWindowAutoPadding &) = delete;
+    /** Allow instances of this class to be move constructed */
+    AccessWindowAutoPadding(AccessWindowAutoPadding &&) = default;
+    /** Allow instances of this class to be moved */
+    AccessWindowAutoPadding &operator=(AccessWindowAutoPadding &&) = default;
+    /** Default destructor */
+    ~AccessWindowAutoPadding() = default;
+
+    /** Set the valid region to match the entire tensor. */
+    void set_valid_region();
+
+    /** Return a valid region that spans across the entire tensor.
+     *
+     * @return a valid region.
+     *
+     */
+    ValidRegion compute_valid_region() const;
+
+    // Inherited methods overridden:
+    bool update_window_if_needed(Window &window) const override;
+    bool update_padding_if_needed(const Window &window) override;
+    ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+
+private:
+    ITensorInfo *_info;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_ACCESS_WINDOW_AUTO_PADDING_H*/
diff --git a/src/core/AccessWindowStatic.cpp b/src/core/AccessWindowStatic.cpp
index 10e88b8632..0607011bc5 100644
--- a/src/core/AccessWindowStatic.cpp
+++ b/src/core/AccessWindowStatic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/AccessWindowStatic.h"
+#include "src/core/AccessWindowStatic.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensorInfo.h"
diff --git a/src/core/AccessWindowStatic.h b/src/core/AccessWindowStatic.h
new file mode 100644
index 0000000000..f7d43cbb55
--- /dev/null
+++ b/src/core/AccessWindowStatic.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_IACCESS_WINDOW_STATIC_H
+#define ARM_COMPUTE_IACCESS_WINDOW_STATIC_H
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+
+#include <array>
+
+namespace arm_compute
+{
+class Window;
+class ITensorInfo;
+
+/** Implementation of a static rectangular access pattern.
+ *
+ * In this implementation the access offsets and sizes are not relative to the
+ * current element. Instead they are considered to be absolute coordinates
+ * within the accessed tensor's shape.
+ *
+ * */
+class AccessWindowStatic : public IAccessWindow
+{
+public:
+    /** Constructor for a static access pattern.
+     *
+     * @param[in,out] info    Tensor info of the accessed kernel.
+     * @param[in]     start_x Start of the access in X direction.
+     * @param[in]     start_y Start of the access in Y direction.
+     * @param[in]     end_x   End of the access in X direction.
+     * @param[in]     end_y   End of the access in Y direction.
+     */
+    AccessWindowStatic(ITensorInfo *info, int start_x, int start_y, int end_x, int end_y);
+
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    AccessWindowStatic(const AccessWindowStatic &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    AccessWindowStatic &operator=(const AccessWindowStatic &) = delete;
+    /** Allow instances of this class to be move constructed */
+    AccessWindowStatic(AccessWindowStatic &&) = default;
+    /** Allow instances of this class to be moved */
+    AccessWindowStatic &operator=(AccessWindowStatic &&) = default;
+    /** Default destructor */
+    ~AccessWindowStatic() = default;
+
+    /** Set the valid region based on the static access pattern and valid
+     *  region of the inputs.
+     *
+     * @param[in] window             Execution window of the kernel.
+     * @param[in] input_valid_region Combined valid region of all inputs.
+     */
+    void set_valid_region(const Window &window, const ValidRegion &input_valid_region);
+
+    /** Compute the valid region based on the static access pattern and valid region of the inputs.
+     *
+     * @param[in] window             Execution window of the kernel.
+     * @param[in] input_valid_region Combined valid region of all inputs.
+     *
+     * @return a valid region.
+     *
+     */
+    ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region) const;
+
+    // Inherited methods overriden:
+    bool update_window_if_needed(Window &window) const override;
+    bool update_padding_if_needed(const Window &window) override;
+    ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+
+private:
+    ITensorInfo *_info;
+    int          _start_x;
+    int          _start_y;
+    int          _end_x;
+    int          _end_y;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_IACCESS_WINDOW_STATIC_H*/
diff --git a/src/core/AccessWindowTranspose.cpp b/src/core/AccessWindowTranspose.cpp
index 4c03ca16c7..d8bd4c4de1 100644
--- a/src/core/AccessWindowTranspose.cpp
+++ b/src/core/AccessWindowTranspose.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/AccessWindowTranspose.h"
+#include "src/core/AccessWindowTranspose.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensorInfo.h"
diff --git a/src/core/AccessWindowTranspose.h b/src/core/AccessWindowTranspose.h
new file mode 100644
index 0000000000..0306076d6e
--- /dev/null
+++ b/src/core/AccessWindowTranspose.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_IACCESS_WINDOW_TRANSPOSE_H
+#define ARM_COMPUTE_IACCESS_WINDOW_TRANSPOSE_H
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class Window;
+class ITensorInfo;
+
+/** Implementation of a XY-transpose access pattern. */
+class AccessWindowTranspose : public AccessWindowRectangle
+{
+public:
+    using AccessWindowRectangle::AccessWindowRectangle;
+    bool update_window_if_needed(Window &window) const override;
+    bool update_padding_if_needed(const Window &window) override;
+    using AccessWindowRectangle::compute_valid_region;
+    ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_IACCESS_WINDOW_TRANSPOSE_H*/
diff --git a/src/core/CL/CLValidate.h b/src/core/CL/CLValidate.h
new file mode 100644
index 0000000000..cbbdf2d9d2
--- /dev/null
+++ b/src/core/CL/CLValidate.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_VALIDATE_H
+#define ARM_COMPUTE_CL_VALIDATE_H
+
+#include "arm_compute/core/Validate.h"
+
+namespace arm_compute
+{
+#define ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(tensor) \
+    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, CLKernelLibrary::get().fp16_supported()))
+
+#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor) \
+    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, CLKernelLibrary::get().fp16_supported()))
+
+/** Return an error if int64_base_atomics extension is not supported by the device.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file     Name of the file where the error occurred.
+ * @param[in] line     Line on which the error occurred.
+ *
+ * @return Status
+ */
+inline arm_compute::Status error_on_unsupported_int64_base_atomics(const char *function, const char *file, const int line)
+{
+    if(!CLKernelLibrary::get().int64_base_atomics_supported())
+    {
+        return ARM_COMPUTE_CREATE_ERROR_LOC(arm_compute::ErrorCode::UNSUPPORTED_EXTENSION_USE, function, file, line, "Atomic functions are not supported");
+    }
+    return arm_compute::Status{};
+}
+
+#define ARM_COMPUTE_ERROR_ON_INT64_BASE_ATOMICS_UNSUPPORTED() \
+    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_int64_base_atomics(__func__, __FILE__, __LINE__));
+
+#define ARM_COMPUTE_RETURN_ERROR_ON_INT64_BASE_ATOMICS_UNSUPPORTED() \
+    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_int64_base_atomics(__func__, __FILE__, __LINE__));
+
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_VALIDATE_H */
diff --git a/src/core/CL/ICLGEMMKernelConfiguration.h b/src/core/CL/ICLGEMMKernelConfiguration.h
new file mode 100644
index 0000000000..ac0e7ab7ff
--- /dev/null
+++ b/src/core/CL/ICLGEMMKernelConfiguration.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ICLGEMMKERNELCONFIGURATION_H
+#define ARM_COMPUTE_ICLGEMMKERNELCONFIGURATION_H
+
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+/** Basic interface for the GEMM kernel configuration */
+class ICLGEMMKernelConfiguration
+{
+public:
+    /** Constructor
+     *
+     * @param[in] arch GPU target
+     */
+    ICLGEMMKernelConfiguration(GPUTarget arch)
+        : _target(arch)
+    {
+    }
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    ICLGEMMKernelConfiguration(const ICLGEMMKernelConfiguration &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    ICLGEMMKernelConfiguration &operator=(const ICLGEMMKernelConfiguration &) = delete;
+    /** Default Move Constructor. */
+    ICLGEMMKernelConfiguration(ICLGEMMKernelConfiguration &&) = default;
+    /** Default move assignment operator */
+    ICLGEMMKernelConfiguration &operator=(ICLGEMMKernelConfiguration &&) = default;
+    /** Virtual destructor */
+    virtual ~ICLGEMMKernelConfiguration() = default;
+    /** Given M, N, K and B, this method returns the @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo to be used
+     *
+     * @param[in] m         Number of rows LHS matrix
+     * @param[in] n         Number of columns RHS matrix
+     * @param[in] k         Number of columns LHS matrix or number of rows RHS matrix
+     * @param[in] b         Batch size
+     * @param[in] data_type Data type
+     */
+    virtual std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0;
+
+protected:
+    GPUTarget _target;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_ICLGEMMKERNELCONFIGURATION_H */
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index be633746a2..f91510b4a7 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,14 +23,9 @@
  */
 #include "arm_compute/core/CL/ICLKernel.h"
 
-#include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/helpers/Utils.h"
 
 #include <cstddef>
 
diff --git a/src/core/CL/ICLSimple2DKernel.cpp b/src/core/CL/ICLSimple2DKernel.cpp
index ce95495fff..dfef5822b2 100644
--- a/src/core/CL/ICLSimple2DKernel.cpp
+++ b/src/core/CL/ICLSimple2DKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,10 +23,7 @@
  */
 #include "arm_compute/core/CL/ICLSimple2DKernel.h"
 
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 using namespace arm_compute;
 
diff --git a/src/core/CL/ICLSimpleKernel.cpp b/src/core/CL/ICLSimpleKernel.cpp
index d2f09a3478..90b5be8069 100644
--- a/src/core/CL/ICLSimpleKernel.cpp
+++ b/src/core/CL/ICLSimpleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,7 @@
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 using namespace arm_compute;
 
diff --git a/src/core/CL/gemm/CLGEMMHelpers.cpp b/src/core/CL/gemm/CLGEMMHelpers.cpp
index 0a4a4adc31..877bf1e047 100644
--- a/src/core/CL/gemm/CLGEMMHelpers.cpp
+++ b/src/core/CL/gemm/CLGEMMHelpers.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
+#include "src/core/CL/gemm/CLGEMMHelpers.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
diff --git a/src/core/CL/gemm/CLGEMMHelpers.h b/src/core/CL/gemm/CLGEMMHelpers.h
new file mode 100644
index 0000000000..013c068cf7
--- /dev/null
+++ b/src/core/CL/gemm/CLGEMMHelpers.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMHELPERS_H
+#define ARM_COMPUTE_CLGEMMHELPERS_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensorInfo;
+struct GEMMRHSMatrixInfo;
+
+namespace cl_gemm
+{
+/** Configure @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
+ *
+ * @param[in] m                  Number of rows (M) in the LHS matrix not reshaped
+ * @param[in] n                  Number of columns (N) in the RHS matrix not reshaped
+ * @param[in] m0                 Number of rows processed by each thread/work-item
+ * @param[in] n0                 Number of columns processed by each thread/work-item
+ * @param[in] k0                 Number of inner accumulation performed by each thread/work-item
+ * @param[in] v0                 Number of vertical blocks of size (m0xk0) stored on the same output row
+ * @param[in] h0                 Number of horizontal blocks of size (k0xn0) stored on the same output row
+ * @param[in] lhs_interleave     True if the v0 (m0xk0) blocks have to be interleaved in the output row
+ * @param[in] rhs_interleave     True if the h0 (k0xn0) blocks have to be interleaved in the output row
+ * @param[in] lhs_transpose      True if the (m0xk0) block has to be transposed before been stored
+ * @param[in] rhs_transpose      True if the (k0xn0) block has to be transposed before been stored
+ * @param[in] export_to_cl_image (Optional) True if the RHS reshaped matrix has to be exported to cl_image
+ *
+ * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
+ */
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
+                                                                       bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image = false);
+
+/** Update padding required to export the OpenCL buffer to OpenCL image2d
+ *
+ * @param[in,out] tensor ITensorInfo of the tensor required to be exported to OpenCL image2d
+ */
+void update_padding_for_cl_image(ITensorInfo *tensor);
+
+/** Utility function to validate the image2d OpenCL object support on the RHS reshaped matrix
+ *
+ * @param[in] tensor_reshaped_info TensorInfo for the RHS reshaped matrix
+ * @param[in] rhs_info             @ref GEMMRHSMatrixInfo
+ *
+ * @return Status reporting if we can use the image2d OpenCL object on the RHS reshaped matrix
+ */
+Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info);
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGEMMHELPERS_H */
diff --git a/src/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h b/src/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h
new file mode 100644
index 0000000000..aecf5a8aa8
--- /dev/null
+++ b/src/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATION_H
+#define ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATION_H
+
+#include "src/core/CL/ICLGEMMKernelConfiguration.h"
+#include "src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h"
+#include "src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h"
+#include "src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h"
+
+#include "support/MemorySupport.h"
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** CLGEMMNative factory class */
+class CLGEMMNativeKernelConfigurationFactory final
+{
+public:
+    /** Static method to construct CLGEMMNative kernel object accordingly with the GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return CLGEMMNative kernel configuration class
+     */
+    static std::unique_ptr<ICLGEMMKernelConfiguration> create(GPUTarget gpu)
+    {
+        switch(get_arch_from_target(gpu))
+        {
+            case GPUTarget::MIDGARD:
+                return support::cpp14::make_unique<CLGEMMNativeKernelConfigurationMidgard>(gpu);
+            case GPUTarget::BIFROST:
+                return support::cpp14::make_unique<CLGEMMNativeKernelConfigurationBifrost>(gpu);
+            case GPUTarget::VALHALL:
+                return support::cpp14::make_unique<CLGEMMNativeKernelConfigurationValhall>(gpu);
+            default:
+                ARM_COMPUTE_ERROR("Not supported GPU target");
+        }
+    }
+};
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATION_H */
diff --git a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.cpp b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.cpp
index 51b7fc7190..4cc3d6ae74 100644
--- a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.cpp
+++ b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.cpp
@@ -21,12 +21,12 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h"
+#include "src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
 #include "arm_compute/core/GPUTarget.h"
+#include "src/core/CL/gemm/CLGEMMHelpers.h"
 
 #include <map>
 #include <utility>
diff --git a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h
new file mode 100644
index 0000000000..1e7432c89a
--- /dev/null
+++ b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONBIFROST_H
+#define ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONBIFROST_H
+
+#include "src/core/CL/ICLGEMMKernelConfiguration.h"
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** Bifrost based OpenCL GEMMNative configuration */
+class CLGEMMNativeKernelConfigurationBifrost final : public ICLGEMMKernelConfiguration
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    CLGEMMNativeKernelConfigurationBifrost(GPUTarget gpu);
+
+    // Inherited overridden method
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONBIFROST_H */
diff --git a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.cpp b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.cpp
index 3e7c17664a..fd699a08f7 100644
--- a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.cpp
+++ b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.cpp
@@ -21,12 +21,12 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h"
+#include "src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
 #include "arm_compute/core/GPUTarget.h"
+#include "src/core/CL/gemm/CLGEMMHelpers.h"
 
 #include <map>
 #include <utility>
diff --git a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h
new file mode 100644
index 0000000000..2f6671706e
--- /dev/null
+++ b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONMIDGARD_H
+#define ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONMIDGARD_H
+
+#include "src/core/CL/ICLGEMMKernelConfiguration.h"
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** Midgard based OpenCL GEMMNative configuration */
+class CLGEMMNativeKernelConfigurationMidgard final : public ICLGEMMKernelConfiguration
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    CLGEMMNativeKernelConfigurationMidgard(GPUTarget gpu);
+
+    // Inherited overridden method
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONMIDGARD_H */
diff --git a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.cpp b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.cpp
index efc82fb78c..2c82340eef 100644
--- a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.cpp
+++ b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.cpp
@@ -21,12 +21,12 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h"
+#include "src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
 #include "arm_compute/core/GPUTarget.h"
+#include "src/core/CL/gemm/CLGEMMHelpers.h"
 
 #include <map>
 #include <utility>
diff --git a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h
new file mode 100644
index 0000000000..fb51b02edf
--- /dev/null
+++ b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONVALHALL_H
+#define ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONVALHALL_H
+
+#include "src/core/CL/ICLGEMMKernelConfiguration.h"
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** Valhall based OpenCL GEMMNative configuration */
+class CLGEMMNativeKernelConfigurationValhall final : public ICLGEMMKernelConfiguration
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    CLGEMMNativeKernelConfigurationValhall(GPUTarget gpu);
+
+    // Inherited overridden method
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONVALHALL_H */
diff --git a/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h
new file mode 100644
index 0000000000..21ccf2d647
--- /dev/null
+++ b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATION_H
+#define ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATION_H
+
+#include "src/core/CL/ICLGEMMKernelConfiguration.h"
+#include "src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h"
+#include "src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h"
+
+#include "support/MemorySupport.h"
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** CLGEMMReshaped factory class */
+class CLGEMMReshapedKernelConfigurationFactory final
+{
+public:
+    /** Static method to call the CLGEMMReshaped kernel configuration class accordingly with the GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return CLGEMMReshaped kernel configuration class
+     */
+    static std::unique_ptr<ICLGEMMKernelConfiguration> create(GPUTarget gpu)
+    {
+        switch(get_arch_from_target(gpu))
+        {
+            case GPUTarget::MIDGARD:
+            case GPUTarget::BIFROST:
+                return support::cpp14::make_unique<CLGEMMReshapedKernelConfigurationBifrost>(gpu);
+            case GPUTarget::VALHALL:
+                return support::cpp14::make_unique<CLGEMMReshapedKernelConfigurationValhall>(gpu);
+            default:
+                ARM_COMPUTE_ERROR("Not supported GPU target");
+        }
+    }
+};
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATION_H */
diff --git a/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp
index b5fc074fb4..00c284facc 100644
--- a/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp
+++ b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp
@@ -21,15 +21,15 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h"
+#include "src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
 #include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/gemm/CLGEMMHelpers.h"
 
 #include <map>
 #include <utility>
@@ -216,17 +216,17 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedKernelConfiguratio
         {
             if(workload <= 790.39f)
             {
-                return configure_lhs_rhs_info(m,n,2,4,4,2,2,false,false,true,false,false);
+                return configure_lhs_rhs_info(m, n, 2, 4, 4, 2, 2, false, false, true, false, false);
             }
             else
             {
                 if(workload <= 982.39f)
                 {
-                    return configure_lhs_rhs_info(m,n,4,2,4,4,4,false,false,true,false,false);
+                    return configure_lhs_rhs_info(m, n, 4, 2, 4, 4, 4, false, false, true, false, false);
                 }
                 else
                 {
-                    return configure_lhs_rhs_info(m,n,2,4,4,2,1,false,true,true,false,false);
+                    return configure_lhs_rhs_info(m, n, 2, 4, 4, 2, 1, false, true, true, false, false);
                 }
             }
         }
@@ -236,16 +236,16 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedKernelConfiguratio
             {
                 if(r_mn <= 0.11f)
                 {
-                    return configure_lhs_rhs_info(m,n,2,4,4,2,2,false,false,true,false,false);
+                    return configure_lhs_rhs_info(m, n, 2, 4, 4, 2, 2, false, false, true, false, false);
                 }
                 else
                 {
-                    return configure_lhs_rhs_info(m,n,4,4,4,4,4,false,true,true,false,false);
+                    return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, false, true, true, false, false);
                 }
             }
             else
             {
-                return configure_lhs_rhs_info(m,n,2,4,4,2,2,false,false,true,false,false);
+                return configure_lhs_rhs_info(m, n, 2, 4, 4, 2, 2, false, false, true, false, false);
             }
         }
     }
@@ -257,22 +257,22 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedKernelConfiguratio
             {
                 if(m <= 64.5)
                 {
-                    return configure_lhs_rhs_info(m,n,4,4,4,2,4,true,false,true,false,false);
+                    return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 4, true, false, true, false, false);
                 }
                 else
                 {
-                    return configure_lhs_rhs_info(m,n,4,4,4,2,2,false,true,true,false,false);
+                    return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, false, true, true, false, false);
                 }
             }
             else
             {
                 if(r_mn <= 1.09f)
                 {
-                    return configure_lhs_rhs_info(m,n,4,4,4,4,4,false,true,true,false,false);
+                    return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, false, true, true, false, false);
                 }
                 else
                 {
-                    return configure_lhs_rhs_info(m,n,4,4,4,2,2,true,true,true,false,false);
+                    return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, true, true, true, false, false);
                 }
             }
         }
@@ -280,17 +280,17 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedKernelConfiguratio
         {
             if(m <= 43)
             {
-                return configure_lhs_rhs_info(m,n,4,4,4,2,4,true,false,true,false,false);
+                return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 4, true, false, true, false, false);
             }
             else
             {
                 if(workload <= 26364.79f)
                 {
-                    return configure_lhs_rhs_info(m,n,4,4,4,2,2,false,true,true,false,false);
+                    return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, false, true, true, false, false);
                 }
                 else
                 {
-                    return configure_lhs_rhs_info(m,n,4,4,4,4,4,false,true,true,false,false);
+                    return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, false, true, true, false, false);
                 }
             }
         }
diff --git a/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h
new file mode 100644
index 0000000000..e3b62ced6a
--- /dev/null
+++ b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONBIFROST_H
+#define ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONBIFROST_H
+
+#include "src/core/CL/ICLGEMMKernelConfiguration.h"
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** Bifrost based OpenCL GEMMReshaped configuration */
+class CLGEMMReshapedKernelConfigurationBifrost final : public ICLGEMMKernelConfiguration
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    CLGEMMReshapedKernelConfigurationBifrost(GPUTarget gpu);
+
+    // Inherited overridden method
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONBIFROST_H */
diff --git a/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.cpp b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.cpp
index 0c09f5084a..519e903a5a 100644
--- a/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.cpp
+++ b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.cpp
@@ -21,12 +21,12 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h"
+#include "src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
 #include "arm_compute/core/GPUTarget.h"
+#include "src/core/CL/gemm/CLGEMMHelpers.h"
 
 #include <map>
 #include <utility>
diff --git a/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h
new file mode 100644
index 0000000000..5f7e701e0e
--- /dev/null
+++ b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONVALHALL_H
+#define ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONVALHALL_H
+
+#include "src/core/CL/ICLGEMMKernelConfiguration.h"
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** Valhall based OpenCL GEMMReshaped configuration */
+class CLGEMMReshapedKernelConfigurationValhall final : public ICLGEMMKernelConfiguration
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    CLGEMMReshapedKernelConfigurationValhall(GPUTarget gpu);
+
+    // Inherited overridden method
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONVALHALL_H */
diff --git a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h
new file mode 100644
index 0000000000..4efe28ce69
--- /dev/null
+++ b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATION_H
+#define ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATION_H
+
+#include "src/core/CL/ICLGEMMKernelConfiguration.h"
+#include "src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h"
+#include "src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h"
+
+#include "support/MemorySupport.h"
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** CLGEMMReshapedOnlyRHS factory class */
+class CLGEMMReshapedOnlyRHSKernelConfigurationFactory final
+{
+public:
+    /** Static method to call the CLGEMMReshapedOnlyRHS kernel configuration class accordingly with the GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return CLGEMMReshapedOnlyRHS kernel configuration class
+     */
+    static std::unique_ptr<ICLGEMMKernelConfiguration> create(GPUTarget gpu)
+    {
+        switch(get_arch_from_target(gpu))
+        {
+            case GPUTarget::MIDGARD:
+            case GPUTarget::BIFROST:
+                return support::cpp14::make_unique<CLGEMMReshapedOnlyRHSKernelConfigurationBifrost>(gpu);
+            case GPUTarget::VALHALL:
+                return support::cpp14::make_unique<CLGEMMReshapedOnlyRHSKernelConfigurationValhall>(gpu);
+            default:
+                ARM_COMPUTE_ERROR("Not supported GPU target");
+        }
+    }
+};
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATION_H */
diff --git a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp
index 16eabf069c..0a0fc5d152 100644
--- a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp
+++ b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp
@@ -21,15 +21,15 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h"
+#include "src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
 #include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/gemm/CLGEMMHelpers.h"
 
 #include <map>
 #include <utility>
@@ -122,13 +122,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfi
 
     if(m == 1)
     {
-        if ( n <= 2548 )
+        if(n <= 2548)
         {
-            return configure_lhs_rhs_info(m,n,1,2,16,1,4,false,true,false,true,false);
+            return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, false, true, false, true, false);
         }
         else
         {
-            return configure_lhs_rhs_info(m,n,1,4,16,1,8,false,true,false,true,false);
+            return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 8, false, true, false, true, false);
         }
     }
     else
@@ -251,7 +251,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfi
 
     if(m == 1)
     {
-        return configure_lhs_rhs_info(m,n,1,2,16,1,32,false,true,false,true,false);
+        return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false);
     }
     else
     {
diff --git a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h
new file mode 100644
index 0000000000..618dbd9923
--- /dev/null
+++ b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONBIFROST_H
+#define ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONBIFROST_H
+
+#include "src/core/CL/ICLGEMMKernelConfiguration.h"
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** Bifrost based OpenCL GEMMReshapedOnlyRHS configuration */
+class CLGEMMReshapedOnlyRHSKernelConfigurationBifrost final : public ICLGEMMKernelConfiguration
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    CLGEMMReshapedOnlyRHSKernelConfigurationBifrost(GPUTarget gpu);
+
+    // Inherited overridden method
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONBIFROST_H */
diff --git a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp
index 23bf02c61a..f7939d29c0 100644
--- a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp
+++ b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp
@@ -21,15 +21,15 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h"
+#include "src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
 #include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/gemm/CLGEMMHelpers.h"
 
 #include <map>
 #include <utility>
diff --git a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h
new file mode 100644
index 0000000000..b9289923b9
--- /dev/null
+++ b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONVALHALL_H
+#define ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONVALHALL_H
+
+#include "src/core/CL/ICLGEMMKernelConfiguration.h"
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** Valhall based OpenCL GEMMReshapedOnlyRHS configuration */
+class CLGEMMReshapedOnlyRHSKernelConfigurationValhall final : public ICLGEMMKernelConfiguration
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    CLGEMMReshapedOnlyRHSKernelConfigurationValhall(GPUTarget gpu);
+
+    // Inherited overridden method
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONVALHALL_H */
diff --git a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp
index 9deb16524e..29745beee7 100644
--- a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp
+++ b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp
@@ -27,12 +27,9 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <set>
 #include <string>
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
index 5a9434ee5a..f0e3047796 100644
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -25,14 +25,14 @@
 
 #include "arm_compute/core/CL/CLCoreRuntimeContext.h"
 #include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/helpers/float_ops.h"
-#include "arm_compute/core/utils/misc/Cast.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+
 #include "support/StringSupport.h"
 
 #include <set>
diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
index b78ac27cfa..b5a801a97f 100644
--- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
@@ -23,16 +23,17 @@
  */
 #include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "support/StringSupport.h"
 
diff --git a/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp
index dd9c234c56..7a8c9ad0fb 100644
--- a/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp
@@ -25,12 +25,12 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Cast.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
 
 #include "support/StringSupport.h"
 
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index 1c1df6c4eb..09b668d6cd 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -25,12 +25,13 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "support/StringSupport.h"
 
diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
index c74f7e055b..6eb22c9ada 100644
--- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
@@ -24,9 +24,11 @@
 #include "arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 using namespace arm_compute::misc::shape_calculator;
diff --git a/src/core/CL/kernels/CLBitwiseAndKernel.cpp b/src/core/CL/kernels/CLBitwiseAndKernel.cpp
index 44378c8239..53a438dcf6 100644
--- a/src/core/CL/kernels/CLBitwiseAndKernel.cpp
+++ b/src/core/CL/kernels/CLBitwiseAndKernel.cpp
@@ -27,9 +27,8 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 using namespace arm_compute;
 
diff --git a/src/core/CL/kernels/CLBitwiseOrKernel.cpp b/src/core/CL/kernels/CLBitwiseOrKernel.cpp
index 77c48e6e82..0e2e5d4f3c 100644
--- a/src/core/CL/kernels/CLBitwiseOrKernel.cpp
+++ b/src/core/CL/kernels/CLBitwiseOrKernel.cpp
@@ -27,9 +27,8 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 using namespace arm_compute;
 
diff --git a/src/core/CL/kernels/CLBitwiseXorKernel.cpp b/src/core/CL/kernels/CLBitwiseXorKernel.cpp
index a15305e3b7..65b17c02bd 100644
--- a/src/core/CL/kernels/CLBitwiseXorKernel.cpp
+++ b/src/core/CL/kernels/CLBitwiseXorKernel.cpp
@@ -27,9 +27,8 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 using namespace arm_compute;
 
diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
index 95ea3d7df5..b8c0d2f2b8 100644
--- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
+++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
@@ -23,17 +23,18 @@
  */
 #include "arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLArray.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLBox3x3Kernel.cpp b/src/core/CL/kernels/CLBox3x3Kernel.cpp
index 7916dce241..2f6c09df0b 100644
--- a/src/core/CL/kernels/CLBox3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLBox3x3Kernel.cpp
@@ -26,9 +26,8 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <set>
 #include <string>
diff --git a/src/core/CL/kernels/CLCannyEdgeKernel.cpp b/src/core/CL/kernels/CLCannyEdgeKernel.cpp
index b8a53650e8..c76ec6769e 100644
--- a/src/core/CL/kernels/CLCannyEdgeKernel.cpp
+++ b/src/core/CL/kernels/CLCannyEdgeKernel.cpp
@@ -29,6 +29,7 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 using namespace arm_compute;
diff --git a/src/core/CL/kernels/CLChannelCombineKernel.cpp b/src/core/CL/kernels/CLChannelCombineKernel.cpp
index b0e5111417..d574f352ae 100644
--- a/src/core/CL/kernels/CLChannelCombineKernel.cpp
+++ b/src/core/CL/kernels/CLChannelCombineKernel.cpp
@@ -27,14 +27,12 @@
 #include "arm_compute/core/CL/ICLMultiImage.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/MultiImageInfo.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <set>
 #include <string>
diff --git a/src/core/CL/kernels/CLChannelExtractKernel.cpp b/src/core/CL/kernels/CLChannelExtractKernel.cpp
index 13ae8f5ef4..7911b948ae 100644
--- a/src/core/CL/kernels/CLChannelExtractKernel.cpp
+++ b/src/core/CL/kernels/CLChannelExtractKernel.cpp
@@ -28,14 +28,13 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/MultiImageInfo.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <set>
 #include <string>
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
index ad000ba17f..301a762850 100644
--- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
@@ -25,12 +25,12 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
index 4050b24e0c..3dc007d9e0 100644
--- a/src/core/CL/kernels/CLCol2ImKernel.cpp
+++ b/src/core/CL/kernels/CLCol2ImKernel.cpp
@@ -25,13 +25,13 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <cmath>
diff --git a/src/core/CL/kernels/CLColorConvertKernel.cpp b/src/core/CL/kernels/CLColorConvertKernel.cpp
index e14b871ae6..0f82d87348 100644
--- a/src/core/CL/kernels/CLColorConvertKernel.cpp
+++ b/src/core/CL/kernels/CLColorConvertKernel.cpp
@@ -27,14 +27,12 @@
 #include "arm_compute/core/CL/ICLMultiImage.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/MultiImageInfo.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <sstream>
diff --git a/src/core/CL/kernels/CLComparisonKernel.cpp b/src/core/CL/kernels/CLComparisonKernel.cpp
index 5bb1d56690..2b72946f49 100644
--- a/src/core/CL/kernels/CLComparisonKernel.cpp
+++ b/src/core/CL/kernels/CLComparisonKernel.cpp
@@ -24,8 +24,10 @@
 #include "arm_compute/core/CL/kernels/CLComparisonKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <map>
diff --git a/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp b/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp
index 7c6114640c..b4e42bf3bc 100644
--- a/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp
+++ b/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp
@@ -25,10 +25,11 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLConvolutionKernel.cpp b/src/core/CL/kernels/CLConvolutionKernel.cpp
index ca07e68345..48b185f78d 100644
--- a/src/core/CL/kernels/CLConvolutionKernel.cpp
+++ b/src/core/CL/kernels/CLConvolutionKernel.cpp
@@ -31,9 +31,9 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <set>
diff --git a/src/core/CL/kernels/CLCopyKernel.cpp b/src/core/CL/kernels/CLCopyKernel.cpp
index 37c3241302..0b7e9aff53 100644
--- a/src/core/CL/kernels/CLCopyKernel.cpp
+++ b/src/core/CL/kernels/CLCopyKernel.cpp
@@ -23,15 +23,15 @@
  */
 #include "arm_compute/core/CL/kernels/CLCopyKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLCropKernel.cpp b/src/core/CL/kernels/CLCropKernel.cpp
index f828162177..2c99d46929 100644
--- a/src/core/CL/kernels/CLCropKernel.cpp
+++ b/src/core/CL/kernels/CLCropKernel.cpp
@@ -26,16 +26,11 @@
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/WindowHelpers.h"
 
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/helpers/bit_ops.h"
-#include "arm_compute/core/utils/helpers/tensor_transform.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "support/StringSupport.h"
 
 #include <map>
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
index e8f12d5d9d..9ba3dc3d8f 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
@@ -25,12 +25,11 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
index 69730346fe..1514d906dc 100644
--- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
@@ -30,6 +30,8 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
index 87067cf717..cb5d727e9b 100644
--- a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
@@ -25,12 +25,12 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Cast.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
 
 #include "support/StringSupport.h"
 
diff --git a/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp b/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
index 11297e7901..24f638f8c4 100644
--- a/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
@@ -25,13 +25,13 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
 #include "support/StringSupport.h"
 
 #include <cstddef>
diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
index e6f909e884..50bb3b827c 100644
--- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
@@ -24,9 +24,11 @@
 #include "arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 using namespace arm_compute::misc::shape_calculator;
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
index 066e9a5a40..7958230aac 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
@@ -23,19 +23,19 @@
  */
 #include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
index 0930fee712..5a0d2d0a62 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
@@ -23,19 +23,19 @@
  */
 #include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index a538ab51cb..5a3a0ec435 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -25,17 +25,16 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp
index 07f25a80cf..0ff3c520ba 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp
@@ -23,19 +23,18 @@
  */
 #include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
index 72eac858ad..e653c59550 100644
--- a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
@@ -23,15 +23,16 @@
  */
 #include "arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLDerivativeKernel.cpp b/src/core/CL/kernels/CLDerivativeKernel.cpp
index ab5f9dab76..659a7cb209 100644
--- a/src/core/CL/kernels/CLDerivativeKernel.cpp
+++ b/src/core/CL/kernels/CLDerivativeKernel.cpp
@@ -26,11 +26,9 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <set>
diff --git a/src/core/CL/kernels/CLDilateKernel.cpp b/src/core/CL/kernels/CLDilateKernel.cpp
index ae948314a3..1e59c349e7 100644
--- a/src/core/CL/kernels/CLDilateKernel.cpp
+++ b/src/core/CL/kernels/CLDilateKernel.cpp
@@ -26,8 +26,8 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 using namespace arm_compute;
 
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index d5d808a80f..161b221e81 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -23,19 +23,18 @@
  */
 #include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp b/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp
index c8c7fb03b8..c7f9df0baa 100644
--- a/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp
+++ b/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp
@@ -24,9 +24,10 @@
 #include "arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/utils/misc/Cast.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
index ec33500f20..f0712f5863 100644
--- a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
+++ b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
@@ -24,9 +24,11 @@
 #include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/utils/misc/Cast.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
 #include "support/StringSupport.h"
 #include <map>
 
diff --git a/src/core/CL/kernels/CLErodeKernel.cpp b/src/core/CL/kernels/CLErodeKernel.cpp
index a5eb79f73b..29a32979a3 100644
--- a/src/core/CL/kernels/CLErodeKernel.cpp
+++ b/src/core/CL/kernels/CLErodeKernel.cpp
@@ -26,8 +26,8 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 using namespace arm_compute;
 
diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
index 30bca2f0f9..0478f550f9 100644
--- a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
+++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
@@ -24,11 +24,11 @@
 #include "arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
index 6c36338dae..7b17a227e1 100644
--- a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
+++ b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
@@ -25,12 +25,12 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <cmath>
diff --git a/src/core/CL/kernels/CLFFTScaleKernel.cpp b/src/core/CL/kernels/CLFFTScaleKernel.cpp
index ac5f2b38c3..49fcbb6c7b 100644
--- a/src/core/CL/kernels/CLFFTScaleKernel.cpp
+++ b/src/core/CL/kernels/CLFFTScaleKernel.cpp
@@ -24,11 +24,11 @@
 #include "arm_compute/core/CL/kernels/CLFFTScaleKernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLFastCornersKernel.cpp b/src/core/CL/kernels/CLFastCornersKernel.cpp
index e71b47228e..ebdfd2741f 100644
--- a/src/core/CL/kernels/CLFastCornersKernel.cpp
+++ b/src/core/CL/kernels/CLFastCornersKernel.cpp
@@ -26,11 +26,9 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <set>
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index 1ea654b5cc..e92619a242 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -27,13 +27,11 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Cast.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLFlattenLayerKernel.cpp b/src/core/CL/kernels/CLFlattenLayerKernel.cpp
index 6bd1149612..dc1d33869f 100644
--- a/src/core/CL/kernels/CLFlattenLayerKernel.cpp
+++ b/src/core/CL/kernels/CLFlattenLayerKernel.cpp
@@ -23,9 +23,10 @@
  */
 #include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 using namespace arm_compute::misc::shape_calculator;
diff --git a/src/core/CL/kernels/CLFloorKernel.cpp b/src/core/CL/kernels/CLFloorKernel.cpp
index 09f5f61a50..8884f3fe36 100644
--- a/src/core/CL/kernels/CLFloorKernel.cpp
+++ b/src/core/CL/kernels/CLFloorKernel.cpp
@@ -25,14 +25,14 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
index b582295f44..61e2b2700a 100644
--- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
@@ -25,12 +25,13 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "support/StringSupport.h"
 
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
index d30a9e5d18..cc98845e0f 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
@@ -23,19 +23,18 @@
  */
 #include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <cstddef>
@@ -137,7 +136,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
     num_elems_processed_per_iteration_x = rhs_info.n0;
     num_elems_processed_per_iteration_y = lhs_info.m0;
 
-    win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
     output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
 
     // Collapse along the Z direction
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
index 56b92a3d41..5469c89c30 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
@@ -23,19 +23,18 @@
  */
 #include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <cstddef>
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
index 4770329b7d..4a3ac2da81 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
@@ -23,19 +23,18 @@
  */
 #include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <cstddef>
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
index 6ef9fd2565..7ab96e5fa9 100644
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
@@ -23,15 +23,13 @@
  */
 #include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <cstddef>
@@ -148,7 +146,8 @@ void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const I
     configure(CLKernelLibrary::get().get_compile_context(), mm_result, vector_sum_col, vector_sum_row, bias, k, a_offset, b_offset);
 }
 
-void CLGEMMLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context, ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias,
+void CLGEMMLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context, ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row,
+                                                   const ICLTensor *bias,
                                                    int32_t k, int32_t a_offset,
                                                    int32_t b_offset)
 {
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
index 6d3aa6fbf6..85285d6704 100644
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
@@ -23,16 +23,15 @@
  */
 #include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <cstddef>
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
index eae66413a6..ab1b5a2203 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
@@ -25,14 +25,14 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "support/StringSupport.h"
 
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
index 430a84cfa0..ad5bac015b 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
@@ -25,14 +25,14 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
index 79888cdba2..8e4b291dbe 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
@@ -25,12 +25,12 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
index 31a97ca32b..826b265dbf 100644
--- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
@@ -23,10 +23,12 @@
  */
 #include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index c2dd92c0fd..aa69ed06d1 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -23,20 +23,19 @@
  */
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/helpers/float_ops.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/helpers/float_ops.h"
 #include "support/StringSupport.h"
 
 #include <set>
@@ -310,7 +309,8 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
     configure(CLKernelLibrary::get().get_compile_context(), input0, input1, input2, output, alpha, beta, is_interleaved_transposed, reshape_info, fp_mixed_precision, activation_info);
 }
 
-void CLGEMMMatrixMultiplyKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
+void CLGEMMMatrixMultiplyKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha,
+                                           float beta,
                                            bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision, const ActivationLayerInfo &activation_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp
index da57aa447f..cea147b10c 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp
@@ -23,20 +23,19 @@
  */
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/helpers/float_ops.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/helpers/float_ops.h"
 #include "support/StringSupport.h"
 
 #include <cstddef>
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
index b0f0e8a81f..eaf57086a3 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
@@ -23,23 +23,22 @@
  */
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/helpers/float_ops.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
 #include "src/core/CL/CLUtils.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/CL/gemm/CLGEMMHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/helpers/float_ops.h"
 #include "support/StringSupport.h"
 
 #include <cstddef>
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
index 0ae30ed30e..912c763ed5 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
@@ -23,17 +23,18 @@
  */
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/helpers/float_ops.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
 #include "src/core/CL/CLUtils.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/CL/gemm/CLGEMMHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/helpers/float_ops.h"
 #include "support/StringSupport.h"
 
 #include <tuple>
diff --git a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
index f52384593b..04aa061e98 100644
--- a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
@@ -23,15 +23,14 @@
  */
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp
index 156a657f28..f2ad677976 100644
--- a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp
@@ -23,19 +23,18 @@
  */
 #include "arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp
index ce294646a0..d94e834d2c 100644
--- a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp
@@ -23,20 +23,19 @@
  */
 #include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/CL/gemm/CLGEMMHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLGatherKernel.cpp b/src/core/CL/kernels/CLGatherKernel.cpp
index 57759fc1c1..a8508bed2d 100644
--- a/src/core/CL/kernels/CLGatherKernel.cpp
+++ b/src/core/CL/kernels/CLGatherKernel.cpp
@@ -23,8 +23,9 @@
  */
 #include "arm_compute/core/CL/kernels/CLGatherKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <string>
diff --git a/src/core/CL/kernels/CLGaussian3x3Kernel.cpp b/src/core/CL/kernels/CLGaussian3x3Kernel.cpp
index 08e7e27b3c..c9ed1ac0d7 100644
--- a/src/core/CL/kernels/CLGaussian3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLGaussian3x3Kernel.cpp
@@ -26,8 +26,8 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <set>
 #include <string>
diff --git a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
index 0e20187d1c..2686e8b32e 100644
--- a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
+++ b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 using namespace arm_compute;
diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
index 3108ad87d0..a2fcbbab78 100644
--- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
+++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
@@ -23,17 +23,18 @@
  */
 #include "arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLArray.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
index 7f618b294b..eaf5ea4880 100644
--- a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
+++ b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
@@ -27,12 +27,10 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <set>
diff --git a/src/core/CL/kernels/CLHOGDetectorKernel.cpp b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
index fbd2208894..6e14996732 100644
--- a/src/core/CL/kernels/CLHOGDetectorKernel.cpp
+++ b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
@@ -27,12 +27,10 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLHOG.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 using namespace arm_compute;
diff --git a/src/core/CL/kernels/CLHarrisCornersKernel.cpp b/src/core/CL/kernels/CLHarrisCornersKernel.cpp
index 08e670f5d2..19c4e579a0 100644
--- a/src/core/CL/kernels/CLHarrisCornersKernel.cpp
+++ b/src/core/CL/kernels/CLHarrisCornersKernel.cpp
@@ -23,17 +23,16 @@
  */
 #include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <set>
diff --git a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp
index 22b2cfcbc5..1ae2599721 100644
--- a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp
@@ -25,13 +25,13 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
 
 #include "support/StringSupport.h"
 
diff --git a/src/core/CL/kernels/CLHistogramKernel.cpp b/src/core/CL/kernels/CLHistogramKernel.cpp
index b8a4e8619d..a85429c1a0 100644
--- a/src/core/CL/kernels/CLHistogramKernel.cpp
+++ b/src/core/CL/kernels/CLHistogramKernel.cpp
@@ -27,12 +27,10 @@
 #include "arm_compute/core/CL/ICLDistribution1D.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <cstring>
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index c94e313b9a..76490f82f6 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -23,18 +23,18 @@
  */
 #include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <cmath>
diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
index 2ad5233de8..e97b856456 100644
--- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
@@ -25,12 +25,13 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "support/StringSupport.h"
 
diff --git a/src/core/CL/kernels/CLIntegralImageKernel.cpp b/src/core/CL/kernels/CLIntegralImageKernel.cpp
index aff4bd9cea..82f6da85a5 100644
--- a/src/core/CL/kernels/CLIntegralImageKernel.cpp
+++ b/src/core/CL/kernels/CLIntegralImageKernel.cpp
@@ -28,9 +28,8 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <cstddef>
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
index a68d8db3c0..9936e29c5f 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
@@ -25,13 +25,14 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "support/StringSupport.h"
 
diff --git a/src/core/CL/kernels/CLLKTrackerKernel.cpp b/src/core/CL/kernels/CLLKTrackerKernel.cpp
index fae5fe2c8e..0fa2e703ec 100644
--- a/src/core/CL/kernels/CLLKTrackerKernel.cpp
+++ b/src/core/CL/kernels/CLLKTrackerKernel.cpp
@@ -23,16 +23,15 @@
  */
 #include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLArray.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <cmath>
 
diff --git a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
index 0da0d4ca1f..6e4c45eab7 100644
--- a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
@@ -23,15 +23,13 @@
  */
 #include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
diff --git a/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp b/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp
index ef8ebd52e5..dc130d0ff9 100644
--- a/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp
+++ b/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp
@@ -27,11 +27,10 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <set>
diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
index 08c74642f4..a78996ddae 100644
--- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
@@ -23,19 +23,18 @@
  */
 #include "arm_compute/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLMeanStdDevKernel.cpp b/src/core/CL/kernels/CLMeanStdDevKernel.cpp
index 33099c928d..5acc3ac3d6 100644
--- a/src/core/CL/kernels/CLMeanStdDevKernel.cpp
+++ b/src/core/CL/kernels/CLMeanStdDevKernel.cpp
@@ -25,14 +25,12 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <cmath>
 #include <set>
diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
index 5ecbb4b2a6..82a22a9f19 100644
--- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
@@ -25,14 +25,13 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLMedian3x3Kernel.cpp b/src/core/CL/kernels/CLMedian3x3Kernel.cpp
index 5f8c9e5a93..4b899502f9 100644
--- a/src/core/CL/kernels/CLMedian3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLMedian3x3Kernel.cpp
@@ -26,8 +26,8 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 using namespace arm_compute;
diff --git a/src/core/CL/kernels/CLMemsetKernel.cpp b/src/core/CL/kernels/CLMemsetKernel.cpp
index f591c2f6d5..186ed2a38c 100644
--- a/src/core/CL/kernels/CLMemsetKernel.cpp
+++ b/src/core/CL/kernels/CLMemsetKernel.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
index 5f0e48dbb9..bf645f82e9 100644
--- a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
@@ -23,14 +23,15 @@
  */
 #include "arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <climits>
diff --git a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
index 9bbda40782..634b58077a 100644
--- a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
+++ b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
@@ -28,7 +28,8 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <climits>
diff --git a/src/core/CL/kernels/CLNonLinearFilterKernel.cpp b/src/core/CL/kernels/CLNonLinearFilterKernel.cpp
index 16e5113c62..0a8472bf04 100644
--- a/src/core/CL/kernels/CLNonLinearFilterKernel.cpp
+++ b/src/core/CL/kernels/CLNonLinearFilterKernel.cpp
@@ -27,12 +27,11 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <algorithm>
 #include <cmath>
diff --git a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp
index 958d94ce11..9c6d44b6c5 100644
--- a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp
@@ -30,6 +30,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <set>
 #include <string>
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
index 7d8e5db2b4..686e6f1b26 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -23,15 +23,18 @@
  */
 #include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/NormalizationHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 using namespace arm_compute;
diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
index 00bdac3441..407ce6626b 100644
--- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
@@ -23,15 +23,16 @@
  */
 #include "arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "support/StringSupport.h"
 
diff --git a/src/core/CL/kernels/CLPadLayerKernel.cpp b/src/core/CL/kernels/CLPadLayerKernel.cpp
index b2432a058d..45729738fb 100644
--- a/src/core/CL/kernels/CLPadLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPadLayerKernel.cpp
@@ -26,6 +26,8 @@
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLPermuteKernel.cpp b/src/core/CL/kernels/CLPermuteKernel.cpp
index dc2d6fe4b4..620665791f 100644
--- a/src/core/CL/kernels/CLPermuteKernel.cpp
+++ b/src/core/CL/kernels/CLPermuteKernel.cpp
@@ -23,8 +23,9 @@
  */
 #include "arm_compute/core/CL/kernels/CLPermuteKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 using namespace arm_compute;
diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
index 229937ef31..a7bd4dad60 100644
--- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
+++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
@@ -25,11 +25,13 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/utils/misc/Cast.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index 1771834aac..0570887b91 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -23,18 +23,19 @@
  */
 #include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <set>
diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
index 3429ef75d1..202e9fbb37 100644
--- a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
@@ -25,13 +25,14 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "support/StringSupport.h"
 
diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
index 2f676d30d1..ff6cc86103 100644
--- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
@@ -23,8 +23,9 @@
  */
 #include "arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
index f6b08884e7..983cbedc0f 100644
--- a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
@@ -23,16 +23,16 @@
  */
 #include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
index 3f2a904f58..ca6c6fad1a 100644
--- a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
@@ -23,18 +23,19 @@
  */
 #include "arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLArray.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 using namespace arm_compute::misc::shape_calculator;
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
index c2ed32653a..55fe5a5321 100644
--- a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
@@ -23,17 +23,18 @@
  */
 #include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLArray.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <cmath>
diff --git a/src/core/CL/kernels/CLRangeKernel.cpp b/src/core/CL/kernels/CLRangeKernel.cpp
index d46cdd78da..a4c30b63c2 100644
--- a/src/core/CL/kernels/CLRangeKernel.cpp
+++ b/src/core/CL/kernels/CLRangeKernel.cpp
@@ -24,9 +24,11 @@
 #include "arm_compute/core/CL/kernels/CLRangeKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Utils.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 using namespace arm_compute;
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 0ba63cc4e0..325e4b994c 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -23,17 +23,18 @@
  */
 #include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "support/StringSupport.h"
 
diff --git a/src/core/CL/kernels/CLRemapKernel.cpp b/src/core/CL/kernels/CLRemapKernel.cpp
index fe8c81a3b9..8d3f41b35f 100644
--- a/src/core/CL/kernels/CLRemapKernel.cpp
+++ b/src/core/CL/kernels/CLRemapKernel.cpp
@@ -23,15 +23,14 @@
  */
 #include "arm_compute/core/CL/kernels/CLRemapKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <algorithm>
 
diff --git a/src/core/CL/kernels/CLReorgLayerKernel.cpp b/src/core/CL/kernels/CLReorgLayerKernel.cpp
index ab81a8fca3..ade7761b91 100644
--- a/src/core/CL/kernels/CLReorgLayerKernel.cpp
+++ b/src/core/CL/kernels/CLReorgLayerKernel.cpp
@@ -26,12 +26,12 @@
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <string>
diff --git a/src/core/CL/kernels/CLReshapeLayerKernel.cpp b/src/core/CL/kernels/CLReshapeLayerKernel.cpp
index 3daf21a9a7..e08970992e 100644
--- a/src/core/CL/kernels/CLReshapeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLReshapeLayerKernel.cpp
@@ -23,18 +23,18 @@
  */
 #include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Cast.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
 
 #include <string>
 
diff --git a/src/core/CL/kernels/CLReverseKernel.cpp b/src/core/CL/kernels/CLReverseKernel.cpp
index 6546ced72e..f8240984d1 100644
--- a/src/core/CL/kernels/CLReverseKernel.cpp
+++ b/src/core/CL/kernels/CLReverseKernel.cpp
@@ -25,12 +25,12 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
index 2e7ee36bcb..8233f210b4 100644
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -23,16 +23,17 @@
  */
 #include "arm_compute/core/CL/kernels/CLScaleKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include "src/core/utils/ScaleUtils.h"
diff --git a/src/core/CL/kernels/CLScharr3x3Kernel.cpp b/src/core/CL/kernels/CLScharr3x3Kernel.cpp
index 3172966b8f..1e33af3047 100644
--- a/src/core/CL/kernels/CLScharr3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLScharr3x3Kernel.cpp
@@ -26,11 +26,9 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <set>
 #include <string>
diff --git a/src/core/CL/kernels/CLSelectKernel.cpp b/src/core/CL/kernels/CLSelectKernel.cpp
index dcac78cca3..d9a1044e1f 100644
--- a/src/core/CL/kernels/CLSelectKernel.cpp
+++ b/src/core/CL/kernels/CLSelectKernel.cpp
@@ -25,12 +25,13 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "support/StringSupport.h"
 
diff --git a/src/core/CL/kernels/CLSobel3x3Kernel.cpp b/src/core/CL/kernels/CLSobel3x3Kernel.cpp
index 86dcf22258..89e5207c44 100644
--- a/src/core/CL/kernels/CLSobel3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLSobel3x3Kernel.cpp
@@ -26,11 +26,9 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <set>
diff --git a/src/core/CL/kernels/CLSobel5x5Kernel.cpp b/src/core/CL/kernels/CLSobel5x5Kernel.cpp
index e010fdda75..3e765e47fb 100644
--- a/src/core/CL/kernels/CLSobel5x5Kernel.cpp
+++ b/src/core/CL/kernels/CLSobel5x5Kernel.cpp
@@ -26,11 +26,9 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <set>
diff --git a/src/core/CL/kernels/CLSobel7x7Kernel.cpp b/src/core/CL/kernels/CLSobel7x7Kernel.cpp
index c2b4bec494..37ceaba502 100644
--- a/src/core/CL/kernels/CLSobel7x7Kernel.cpp
+++ b/src/core/CL/kernels/CLSobel7x7Kernel.cpp
@@ -26,11 +26,9 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <set>
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
index c7881b9f5f..5c0acda41a 100644
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
@@ -23,18 +23,19 @@
  */
 #include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <set>
diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
index 3e0ac74f69..c6f70c3c09 100644
--- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
@@ -24,9 +24,11 @@
 #include "arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 using namespace arm_compute::misc::shape_calculator;
diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
index 4b6c1be8c2..2d46aade34 100644
--- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
@@ -24,9 +24,11 @@
 #include "arm_compute/core/CL/kernels/CLSpaceToDepthLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 using namespace arm_compute::misc::shape_calculator;
diff --git a/src/core/CL/kernels/CLStackLayerKernel.cpp b/src/core/CL/kernels/CLStackLayerKernel.cpp
index c283c440a3..5055065779 100644
--- a/src/core/CL/kernels/CLStackLayerKernel.cpp
+++ b/src/core/CL/kernels/CLStackLayerKernel.cpp
@@ -25,13 +25,14 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "support/StringSupport.h"
 
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp
index f7b7290a3f..b632e05d84 100644
--- a/src/core/CL/kernels/CLStridedSliceKernel.cpp
+++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp
@@ -24,10 +24,12 @@
 #include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/utils/helpers/bit_ops.h"
 #include "arm_compute/core/utils/helpers/tensor_transform.h"
-#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/helpers/bit_ops.h"
+#include "support/Cast.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLTileKernel.cpp b/src/core/CL/kernels/CLTileKernel.cpp
index bba152530c..43c8953363 100644
--- a/src/core/CL/kernels/CLTileKernel.cpp
+++ b/src/core/CL/kernels/CLTileKernel.cpp
@@ -23,8 +23,9 @@
  */
 #include "arm_compute/core/CL/kernels/CLTileKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp
index a47d956620..bd910196e9 100644
--- a/src/core/CL/kernels/CLTransposeKernel.cpp
+++ b/src/core/CL/kernels/CLTransposeKernel.cpp
@@ -23,18 +23,18 @@
  */
 #include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/AccessWindowTranspose.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <set>
 #include <sstream>
diff --git a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
index 101055001c..a4fc10f26a 100644
--- a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
@@ -23,16 +23,18 @@
  */
 #include "arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLWarpAffineKernel.cpp b/src/core/CL/kernels/CLWarpAffineKernel.cpp
index e8da803628..95a7c1b875 100644
--- a/src/core/CL/kernels/CLWarpAffineKernel.cpp
+++ b/src/core/CL/kernels/CLWarpAffineKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
@@ -32,6 +31,8 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <cstddef>
diff --git a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
index dc7c359849..2fe1feb485 100644
--- a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
+++ b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
@@ -32,6 +31,8 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <cstddef>
 #include <set>
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
index 267957e51a..f69967265a 100644
--- a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
+++ b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
@@ -25,6 +25,8 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
index 76100c2a63..27c650894c 100644
--- a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
@@ -23,16 +23,16 @@
  */
 #include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/helpers/tensor_info.h"
-#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/helpers/tensor_info.h"
+#include "support/Cast.h"
 
 #include "support/StringSupport.h"
 
diff --git a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
index 0377eb76b1..5ef2cc46ee 100644
--- a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
@@ -23,17 +23,16 @@
  */
 #include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/helpers/tensor_info.h"
-#include "arm_compute/core/utils/misc/Cast.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/helpers/tensor_info.h"
+#include "support/Cast.h"
 
 #include "support/StringSupport.h"
 
diff --git a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
index d40597fbb5..8b0aaf3227 100644
--- a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
@@ -25,13 +25,12 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Cast.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
 
 #include "support/StringSupport.h"
 
diff --git a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
index 4a1c48a258..7fab208221 100644
--- a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
@@ -23,10 +23,8 @@
  */
 #include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
@@ -36,6 +34,10 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "support/StringSupport.h"
 
diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
index c4c2b08a81..15c239e849 100644
--- a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
@@ -23,10 +23,8 @@
  */
 #include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Error.h"
@@ -34,6 +32,10 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 using namespace arm_compute;
diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
index 19f61b19b3..8ae0255319 100644
--- a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
@@ -23,10 +23,8 @@
  */
 #include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
@@ -36,6 +34,10 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "support/StringSupport.h"
 
diff --git a/src/core/CL/kernels/CLYOLOLayerKernel.cpp b/src/core/CL/kernels/CLYOLOLayerKernel.cpp
index 132d5d1cd0..0c7588d740 100644
--- a/src/core/CL/kernels/CLYOLOLayerKernel.cpp
+++ b/src/core/CL/kernels/CLYOLOLayerKernel.cpp
@@ -23,18 +23,20 @@
  */
 #include "arm_compute/core/CL/kernels/CLYOLOLayerKernel.h"
 
+#include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/Types.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
diff --git a/src/core/CPP/ICPPSimpleKernel.cpp b/src/core/CPP/ICPPSimpleKernel.cpp
index 126bf548e2..9e4df5ec8a 100644
--- a/src/core/CPP/ICPPSimpleKernel.cpp
+++ b/src/core/CPP/ICPPSimpleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
@@ -71,4 +72,4 @@ Status ICPPSimpleKernel::validate(const ITensorInfo *input, const ITensorInfo *o
     return Status{};
 }
 
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/CPP/Validate.h b/src/core/CPP/Validate.h
new file mode 100644
index 0000000000..9e95f72c3f
--- /dev/null
+++ b/src/core/CPP/Validate.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPP_VALIDATE_H
+#define ARM_COMPUTE_CPP_VALIDATE_H
+
+#include "arm_compute/core/Validate.h"
+
+namespace arm_compute
+{
+/** Return an error if the data type of the passed tensor info is FP16 and FP16 support is not compiled in.
+ *
+ * @param[in] function    Function in which the error occurred.
+ * @param[in] file        Name of the file where the error occurred.
+ * @param[in] line        Line on which the error occurred.
+ * @param[in] tensor_info Tensor info to validate.
+ *
+ * @return Status
+ */
+inline Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line,
+                                            const ITensorInfo *tensor_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(tensor_info->data_type() == DataType::F16,
+                                        function, file, line, "This CPU architecture does not support F16 data type, you need v8.2 or above");
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+    return Status {};
+}
+
+/** Return an error if the data type of the passed tensor info is BFLOAT16 and BFLOAT16 support is not compiled in.
+ *
+ * @param[in] function    Function in which the error occurred.
+ * @param[in] file        Name of the file where the error occurred.
+ * @param[in] line        Line on which the error occurred.
+ * @param[in] tensor_info Tensor info to validate.
+ *
+ * @return Status
+ */
+inline Status error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line,
+                                            const ITensorInfo *tensor_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
+#if !(defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16))
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(tensor_info->data_type() == DataType::BFLOAT16,
+                                        function, file, line, "This CPU architecture does not support BFloat16 data type, you need v8.6 or above");
+#endif /* !(defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)) */
+    return Status {};
+}
+
+/** Return an error if the data type of the passed tensor is FP16 and FP16 support is not compiled in.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file     Name of the file where the error occurred.
+ * @param[in] line     Line on which the error occurred.
+ * @param[in] tensor   Tensor to validate.
+ *
+ * @return Status
+ */
+inline Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line,
+                                            const ITensor *tensor)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
+    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_fp16(function, file, line, tensor->info()));
+    return Status{};
+}
+
+/** Return an error if the data type of the passed tensor is BFLOAT16 and BFLOAT16 support is not compiled in.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file     Name of the file where the error occurred.
+ * @param[in] line     Line on which the error occurred.
+ * @param[in] tensor   Tensor to validate.
+ *
+ * @return Status
+ */
+inline Status error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line,
+                                            const ITensor *tensor)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
+    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_bf16(function, file, line, tensor->info()));
+    return Status{};
+}
+
+#define ARM_COMPUTE_ERROR_ON_CPU_F16_UNSUPPORTED(tensor) \
+    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_cpu_fp16(__func__, __FILE__, __LINE__, tensor))
+
+#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor) \
+    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_fp16(__func__, __FILE__, __LINE__, tensor))
+
+#define ARM_COMPUTE_ERROR_ON_CPU_BF16_UNSUPPORTED(tensor) \
+    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_cpu_bf16(__func__, __FILE__, __LINE__, tensor))
+
+#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(tensor) \
+    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_bf16(__func__, __FILE__, __LINE__, tensor))
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPP_VALIDATE_H */
diff --git a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
index 917a6ad08b..fb1754247c 100644
--- a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
+++ b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h"
 
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <algorithm>
 #include <cmath>
diff --git a/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp b/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp
index a0cfb3ba8b..d7af9c9e7a 100644
--- a/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp
+++ b/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,17 +23,9 @@
  */
 #include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h"
 
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include "support/Mutex.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 using namespace arm_compute;
 
diff --git a/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp b/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
index ec03b72b6b..3166faba48 100644
--- a/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
+++ b/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
 
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 
 #include <algorithm>
diff --git a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
index 89e3058520..c1187ff2b3 100644
--- a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
+++ b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
@@ -23,10 +23,12 @@
  */
 #include "arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h"
 
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
 
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
 #include <algorithm>
 
 namespace arm_compute
diff --git a/src/core/CPP/kernels/CPPPermuteKernel.cpp b/src/core/CPP/kernels/CPPPermuteKernel.cpp
index 1d1f0cd30e..054c7bf05a 100644
--- a/src/core/CPP/kernels/CPPPermuteKernel.cpp
+++ b/src/core/CPP/kernels/CPPPermuteKernel.cpp
@@ -23,13 +23,10 @@
  */
 #include "arm_compute/core/CPP/kernels/CPPPermuteKernel.h"
 
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <cstddef>
 #include <cstdint>
diff --git a/src/core/CPP/kernels/CPPTopKVKernel.cpp b/src/core/CPP/kernels/CPPTopKVKernel.cpp
index 7ba8d7cdd0..d2b54e412e 100644
--- a/src/core/CPP/kernels/CPPTopKVKernel.cpp
+++ b/src/core/CPP/kernels/CPPTopKVKernel.cpp
@@ -22,16 +22,14 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/CPP/kernels/CPPTopKVKernel.h"
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
+
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/Traits.h"
 
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
 namespace arm_compute
 {
 namespace
diff --git a/src/core/CPP/kernels/CPPUpsampleKernel.cpp b/src/core/CPP/kernels/CPPUpsampleKernel.cpp
index ff4ffb6124..7ef83fb2c4 100644
--- a/src/core/CPP/kernels/CPPUpsampleKernel.cpp
+++ b/src/core/CPP/kernels/CPPUpsampleKernel.cpp
@@ -23,13 +23,8 @@
  */
 #include "arm_compute/core/CPP/kernels/CPPUpsampleKernel.h"
 
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <cstddef>
 #include <cstdint>
diff --git a/src/core/GLES_COMPUTE/IGCSimpleKernel.cpp b/src/core/GLES_COMPUTE/IGCSimpleKernel.cpp
index 6609f457e2..fb31ac8377 100644
--- a/src/core/GLES_COMPUTE/IGCSimpleKernel.cpp
+++ b/src/core/GLES_COMPUTE/IGCSimpleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,7 @@
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 using namespace arm_compute;
 
diff --git a/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp
index f0a500398b..5e8accc95d 100644
--- a/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp
@@ -32,6 +32,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <set>
diff --git a/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
index 1c02f41286..0173b81cf8 100644
--- a/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
@@ -32,6 +32,8 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <set>
diff --git a/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp
index 06c34863d7..f31c8ca156 100644
--- a/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp
@@ -34,6 +34,8 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <cstddef>
diff --git a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
index 3bd34acb92..9281ce5ffb 100644
--- a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
 #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
 #include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
@@ -31,6 +30,9 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "support/StringSupport.h"
 
diff --git a/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp
index 4fe6484cf8..5781c564ea 100644
--- a/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp
@@ -24,7 +24,6 @@
 
 #include "arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
 #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
@@ -33,6 +32,9 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 using namespace arm_compute;
@@ -42,7 +44,7 @@ GCCol2ImKernel::GCCol2ImKernel()
 {
 }
 
-void GCCol2ImKernel::configure(const IGCTensor *input, IGCTensor *output,
+void GCCol2ImKernel::configure(const IGCTensor *input, IGCTensor    *output,
                                std::pair<unsigned int, unsigned int> convolved_dims)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
index 458cb639a3..3256f11e74 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
@@ -32,6 +32,8 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "support/StringSupport.h"
 
diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
index cb70dae3ec..95d487b4dd 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
 #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
@@ -34,6 +33,9 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 using namespace arm_compute;
diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
index 302b21be0d..9ce8acea09 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
 #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
@@ -33,6 +32,9 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 using namespace arm_compute;
@@ -44,7 +46,7 @@ GCDirectConvolutionLayerKernel<kernel_size>::GCDirectConvolutionLayerKernel()
 }
 
 template <unsigned int kernel_size>
-BorderSize GCDirectConvolutionLayerKernel<kernel_size>::border_size() const
+BorderSize             GCDirectConvolutionLayerKernel<kernel_size>::border_size() const
 {
     return _border_size;
 }
@@ -70,8 +72,8 @@ void GCDirectConvolutionLayerKernel<kernel_size>::configure(const IGCTensor *inp
     }
 
     // Get convolved dimensions
-    unsigned int owidth       = 0;
-    unsigned int oheight      = 0;
+    unsigned int owidth  = 0;
+    unsigned int oheight = 0;
     std::tie(owidth, oheight) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_size, kernel_size, conv_info);
 
     TensorShape output_shape = input->info()->tensor_shape();
@@ -238,20 +240,20 @@ void GCDirectConvolutionLayerKernel<kernel_size>::configure(const IGCTensor *inp
                 num_elems_written_per_iteration_x = 4;
 #elif defined(PROCESS_4X_2Y_1Z)
                 options.emplace("#define PROCESS_4X_2Y_1Z");
-                num_elems_read_per_iteration_x = 4;
-                num_elems_read_per_iteration_y = 2;
+                num_elems_read_per_iteration_x    = 4;
+                num_elems_read_per_iteration_y    = 2;
                 num_elems_written_per_iteration_x = 4;
                 num_elems_written_per_iteration_y = 2;
 #elif defined(PROCESS_4X_3Y_1Z)
                 options.emplace("#define PROCESS_4X_3Y_1Z");
-                num_elems_read_per_iteration_x = 4;
-                num_elems_read_per_iteration_y = 3;
+                num_elems_read_per_iteration_x    = 4;
+                num_elems_read_per_iteration_y    = 3;
                 num_elems_written_per_iteration_x = 4;
                 num_elems_written_per_iteration_y = 3;
 #elif defined(PROCESS_4X_4Y_1Z)
                 options.emplace("#define PROCESS_4X_4Y_1Z");
-                num_elems_read_per_iteration_x = 4;
-                num_elems_read_per_iteration_y = 4;
+                num_elems_read_per_iteration_x    = 4;
+                num_elems_read_per_iteration_y    = 4;
                 num_elems_written_per_iteration_x = 4;
                 num_elems_written_per_iteration_y = 4;
 #elif defined(PROCESS_4X_2Y_2Z)
diff --git a/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp
index 5c6722af6a..bda6599f86 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp
@@ -32,6 +32,8 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <cmath>
diff --git a/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp
index 3b3118bc3d..7ffcdd2f3f 100644
--- a/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp
@@ -32,6 +32,8 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <cstdint>
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp
index e0f7e957d8..d395759558 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp
@@ -33,6 +33,8 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 using namespace arm_compute;
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
index c9eb4337fa..66fdde5473 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
 #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
@@ -33,6 +32,9 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 using namespace arm_compute;
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp
index e8298bc327..daad70bba9 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp
@@ -32,6 +32,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 using namespace arm_compute;
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
index dd03faf2df..2f69728b61 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
@@ -23,8 +23,6 @@
  */
 #include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
 #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
@@ -37,6 +35,10 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/AccessWindowTranspose.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <set>
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp
index 4190163694..1d6ef3d0e8 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h"
 
-#include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
 #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
@@ -33,6 +32,9 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowTranspose.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <cmath>
diff --git a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
index 64f2d63fec..c12dd38cb4 100644
--- a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
@@ -24,7 +24,6 @@
 
 #include "arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
 #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
@@ -35,6 +34,9 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <cmath>
@@ -91,7 +93,7 @@ void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, const
     int stride_y = 0;
 
     std::tie(stride_x, stride_y) = conv_info.stride();
-    _kernel_dims                 = std::make_pair(kernel_dims.width, kernel_dims.height);
+    _kernel_dims = std::make_pair(kernel_dims.width, kernel_dims.height);
 
     const bool run_img2col_reduced = (output->info()->dimension(0) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))) && (TensorShape::num_max_dimensions >= 4)
                                      && (std::equal(input->info()->tensor_shape().cbegin() + 3,
@@ -109,9 +111,9 @@ void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, const
         }
 
         build_opts.emplace("#define IM2COL_GENERIC");
-        _convolved_dims                    = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
-                                                               kernel_dims.width, kernel_dims.height,
-                                                               conv_info, dilation);
+        _convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
+                                            kernel_dims.width, kernel_dims.height,
+                                            conv_info, dilation);
         _num_elems_processed_per_iteration = (input->info()->data_type() == DataType::F32) ? 1 : 2;
 
         build_opts.emplace("#define KERNEL_WIDTH " + support::cpp11::to_string(kernel_dims.width));
diff --git a/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp
index 5fa1987bf1..c29d9fc4d5 100644
--- a/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp
@@ -31,6 +31,8 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <string>
diff --git a/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp
index 6a79990484..971b540a83 100644
--- a/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
 #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
 #include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
@@ -31,6 +30,9 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "support/StringSupport.h"
 
diff --git a/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp
index 45aa06cc2d..76559146ae 100644
--- a/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp
@@ -32,6 +32,8 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <cmath>
diff --git a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
index a592c09cc0..13efd10532 100644
--- a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
 #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
 #include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
@@ -33,6 +32,9 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <set>
diff --git a/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
index cf10b92dd1..a0795c668f 100644
--- a/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
 #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
@@ -33,6 +32,9 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <set>
diff --git a/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp
index f4ed9617fa..39d586da72 100644
--- a/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
 #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
 #include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
@@ -33,6 +32,9 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <set>
diff --git a/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp
index d06be9b8a6..78b008484e 100644
--- a/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
 #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
@@ -33,6 +32,9 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 using namespace arm_compute;
diff --git a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
index 66b4a55bd8..3bec05b5f1 100644
--- a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
 #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
@@ -31,6 +30,9 @@
 #include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <set>
diff --git a/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp
index 9a430b43cb..bcdbfb60dc 100644
--- a/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp
@@ -32,6 +32,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
diff --git a/src/core/Helpers.cpp b/src/core/Helpers.cpp
index 6701ff4483..e692cc1e7c 100644
--- a/src/core/Helpers.cpp
+++ b/src/core/Helpers.cpp
@@ -25,162 +25,6 @@
 
 namespace arm_compute
 {
-Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
-{
-    if(!skip_border)
-    {
-        border_size = BorderSize(0);
-    }
-
-    const Coordinates &anchor = valid_region.anchor;
-    const TensorShape &shape  = valid_region.shape;
-
-    Window window;
-
-    window.set(0, Window::Dimension(
-                   // Skip the border left of the image
-                   anchor[0] + border_size.left,
-                   // Skip the border right of the image
-                   // Make sure the window width is a multiple of the step size
-                   anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]),
-                   steps[0]));
-
-    size_t n = 1;
-
-    if(anchor.num_dimensions() > 1)
-    {
-        window.set(1, Window::Dimension(
-                       // Skip the border above the image
-                       anchor[1] + border_size.top,
-                       // Skip the border below the image
-                       anchor[1] + border_size.top + ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) - static_cast<int>(border_size.top) - static_cast<int>(border_size.bottom)), steps[1]),
-                       steps[1]));
-
-        ++n;
-    }
-
-    if(anchor.num_dimensions() > 2)
-    {
-        window.set(2, Window::Dimension(anchor[2], std::max<size_t>(1, shape[2]), steps[2]));
-
-        ++n;
-    }
-
-    for(; n < anchor.num_dimensions(); ++n)
-    {
-        window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n])));
-    }
-
-    for(; n < Coordinates::num_max_dimensions; ++n)
-    {
-        window.set(n, Window::Dimension(0, 1));
-    }
-
-    return window;
-}
-
-Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Steps &steps, BorderSize border_size)
-{
-    const Coordinates &anchor = valid_region.anchor;
-    const TensorShape &shape  = valid_region.shape;
-
-    Window window;
-
-    window.set(0, Window::Dimension(
-                   // move the anchor to the start from the border
-                   anchor[0] - border_size.left,
-                   // move the anchor to include the right end border
-                   // Make sure the window width is a multiple of the step size
-                   anchor[0] - border_size.left + ceil_to_multiple(shape[0] + border_size.left + border_size.right, steps[0]),
-                   steps[0]));
-
-    size_t n = 1;
-
-    if(anchor.num_dimensions() > 1)
-    {
-        window.set(1, Window::Dimension(
-                       // Include the border above the image
-                       anchor[1] - border_size.top,
-                       // Include the border below the image
-                       anchor[1] - border_size.top + ceil_to_multiple(shape[1] + border_size.top + border_size.bottom, steps[1]),
-                       steps[1]));
-
-        ++n;
-    }
-
-    if(anchor.num_dimensions() > 2)
-    {
-        window.set(2, Window::Dimension(0, std::max<size_t>(1, shape[n]), steps[2]));
-
-        ++n;
-    }
-
-    for(; n < anchor.num_dimensions(); ++n)
-    {
-        window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n])));
-    }
-
-    for(; n < Coordinates::num_max_dimensions; ++n)
-    {
-        window.set(n, Window::Dimension(0, 1));
-    }
-
-    return window;
-}
-
-Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
-{
-    if(skip_border)
-    {
-        border_size.top    = 0;
-        border_size.bottom = 0;
-    }
-    else
-    {
-        border_size.left  = 0;
-        border_size.right = 0;
-    }
-
-    const Coordinates &anchor = valid_region.anchor;
-    const TensorShape &shape  = valid_region.shape;
-
-    Window window;
-
-    window.set(0, Window::Dimension(
-                   // Skip the border left of the image
-                   anchor[0] + border_size.left,
-                   // Skip the border right of the image
-                   // Make sure the window width is a multiple of the step size
-                   anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]),
-                   steps[0]));
-
-    size_t n = 1;
-
-    if(anchor.num_dimensions() > 1)
-    {
-        window.set(1, Window::Dimension(
-                       // Skip the border above the image
-                       anchor[1] - border_size.top,
-                       // Skip the border below the image
-                       anchor[1] + shape[1] + border_size.bottom,
-                       1));
-
-        ++n;
-    }
-
-    for(; n < anchor.num_dimensions(); ++n)
-    {
-        window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n])));
-    }
-
-    for(; n < Coordinates::num_max_dimensions; ++n)
-    {
-        window.set(n, Window::Dimension(0, 1));
-    }
-
-    return window;
-}
-
 ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const TensorShape &dst_shape,
                                          InterpolationPolicy interpolate_policy, SamplingPolicy sampling_policy, bool border_undefined)
 {
@@ -256,19 +100,4 @@ ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const Tens
 
     return valid_region;
 }
-
-PermutationVector get_permutation_vector_from_softmax_axis(size_t actual_axis)
-{
-    switch(actual_axis)
-    {
-        case 1:
-            return PermutationVector(1U, 0U, 2U, 3U);
-        case 2:
-            return PermutationVector(2U, 1U, 0U, 3U);
-        case 3:
-            return PermutationVector(3U, 1U, 2U, 0U);
-        default:
-            ARM_COMPUTE_ERROR("Axis not supported");
-    }
-}
 } // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/NEON/NETracePoint.cpp b/src/core/NEON/NETracePoint.cpp
index cb0dc1400a..4a6bffa54e 100644
--- a/src/core/NEON/NETracePoint.cpp
+++ b/src/core/NEON/NETracePoint.cpp
@@ -24,8 +24,8 @@
 #include "arm_compute/core/TracePoint.h"
 
 #include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
-#include "arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
-#include "arm_compute/core/NEON/kernels/convolution/common/convolution.hpp"
+#include "src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
+#include "src/core/NEON/kernels/convolution/common/convolution.hpp"
 #include "utils/TypePrinter.h"
 
 #include <memory>
diff --git a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
index 3d4800fe15..acea0af02d 100644
--- a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
+++ b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
@@ -30,6 +30,8 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEAccumulateKernel.cpp b/src/core/NEON/kernels/NEAccumulateKernel.cpp
index 7c85f698ae..73ef7eb66f 100644
--- a/src/core/NEON/kernels/NEAccumulateKernel.cpp
+++ b/src/core/NEON/kernels/NEAccumulateKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,6 +28,8 @@
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index d80aab7069..9616f4faca 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -23,15 +23,17 @@
  */
 #include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/NESymm.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <set>
diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
index 525e2866f2..7f1a35fb55 100644
--- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
@@ -23,12 +23,14 @@
  */
 #include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <map>
 #include <string>
diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
index a3da7508ab..49e503fac4 100644
--- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
@@ -23,12 +23,14 @@
  */
 #include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/NESymm.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp
index c7169d8932..65ac996f46 100644
--- a/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp
@@ -32,6 +32,8 @@
 #include "arm_compute/core/Window.h"
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index 50e46474b5..bda396662f 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -23,14 +23,16 @@
  */
 #include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/NEFixedPoint.h"
 #include "src/core/NEON/NEMath.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
 #include "src/core/NEON/wrapper/wrapper.h"
diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
index eb28ce0a8b..e24d7b6c0a 100644
--- a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
@@ -28,6 +28,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 using namespace arm_compute::misc::shape_calculator;
 
diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
index caaa6c22e8..2d49ff825e 100644
--- a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
@@ -28,6 +28,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstdint>
diff --git a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
index 4da07f93b0..eed9b273ae 100644
--- a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,8 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstdint>
diff --git a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
index 591acf50e1..f96117e860 100644
--- a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,8 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstdint>
diff --git a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
index b0aec4078f..45d2b0a0db 100644
--- a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,8 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstdint>
diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
index 56444dcbc0..5a18e88321 100644
--- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
+++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
@@ -23,12 +23,14 @@
  */
 #include "arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEBox3x3Kernel.cpp b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
index d5d03a9def..1177f6f1dd 100644
--- a/src/core/NEON/kernels/NEBox3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,9 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
 #include <arm_neon.h>
 
 using namespace arm_compute;
diff --git a/src/core/NEON/kernels/NECannyEdgeKernel.cpp b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
index 0278bb08e1..da33c1b1ea 100644
--- a/src/core/NEON/kernels/NECannyEdgeKernel.cpp
+++ b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -31,6 +30,11 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/NEChannelCombineKernel.cpp b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
index 0de6c4326a..7bd380831b 100644
--- a/src/core/NEON/kernels/NEChannelCombineKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,6 +33,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEChannelExtractKernel.cpp b/src/core/NEON/kernels/NEChannelExtractKernel.cpp
index 800c63606f..86245acd05 100644
--- a/src/core/NEON/kernels/NEChannelExtractKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelExtractKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,6 +34,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
index 88cd0ae514..6d04d71534 100644
--- a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -31,6 +30,9 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NECol2ImKernel.cpp b/src/core/NEON/kernels/NECol2ImKernel.cpp
index 6a07defd79..f3192370a6 100644
--- a/src/core/NEON/kernels/NECol2ImKernel.cpp
+++ b/src/core/NEON/kernels/NECol2ImKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/NEColorConvertKernel.cpp b/src/core/NEON/kernels/NEColorConvertKernel.cpp
index bc8c77543a..f933a2a898 100644
--- a/src/core/NEON/kernels/NEColorConvertKernel.cpp
+++ b/src/core/NEON/kernels/NEColorConvertKernel.cpp
@@ -33,6 +33,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "src/core/NEON/kernels/detail/NEColorConvertHelper.inl"
 
diff --git a/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp
index 97bb8ccb8a..8716cfd9b5 100644
--- a/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp
+++ b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,8 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp
index f40f1215d3..bd8ea30fb3 100644
--- a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp
+++ b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp
@@ -30,6 +30,8 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NEConvolutionKernel.cpp b/src/core/NEON/kernels/NEConvolutionKernel.cpp
index 7103fa1618..69b65b2816 100644
--- a/src/core/NEON/kernels/NEConvolutionKernel.cpp
+++ b/src/core/NEON/kernels/NEConvolutionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,6 +32,8 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <algorithm>
 #include <arm_neon.h>
diff --git a/src/core/NEON/kernels/NECopyKernel.cpp b/src/core/NEON/kernels/NECopyKernel.cpp
index 3d00139263..b299957b57 100644
--- a/src/core/NEON/kernels/NECopyKernel.cpp
+++ b/src/core/NEON/kernels/NECopyKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,8 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NECropKernel.cpp b/src/core/NEON/kernels/NECropKernel.cpp
index 7c65e71727..5fb55d95a9 100644
--- a/src/core/NEON/kernels/NECropKernel.cpp
+++ b/src/core/NEON/kernels/NECropKernel.cpp
@@ -23,17 +23,18 @@
  */
 #include "arm_compute/core/NEON/kernels/NECropKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Window.h"
-
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/helpers/bit_ops.h"
+#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/helpers/tensor_transform.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/helpers/bit_ops.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
index cec0e1ce60..5628802783 100644
--- a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
+++ b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,8 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <algorithm>
 #include <cmath>
diff --git a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
index 6066326fec..b500268477 100644
--- a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
@@ -34,6 +34,8 @@
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/NEFixedPoint.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <cstdint>
 
diff --git a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
index ee23909bd6..259ece7c6f 100644
--- a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
@@ -23,17 +23,18 @@
  */
 #include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/SaturateCast.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/NEFixedPoint.h"
 #include "src/core/NEON/NEMath.h"
-
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/SaturateCast.h"
 
 using namespace arm_compute;
 
diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
index 6465848999..403e7aac9f 100644
--- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
@@ -29,6 +29,9 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
 #include <arm_neon.h>
 #include <cstdint>
 
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp
index 0a34ee6a07..533b374594 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp
@@ -23,13 +23,15 @@
  */
 #include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp"
 #include "src/core/NEON/wrapper/traits.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/ToolchainSupport.h"
 
 namespace arm_compute
@@ -48,7 +50,6 @@ constexpr size_t vector_size          = 8;
 
 struct DepthwiseConvolutionRunInfo
 {
-public:
     const size_t   num_read_elements_per_iteration;
     const uint32_t x_start;
     const uint32_t x_end;
diff --git a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
index 9352088b1f..2f3c6f431c 100644
--- a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
@@ -23,16 +23,18 @@
  */
 #include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/NESymm.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEDerivativeKernel.cpp b/src/core/NEON/kernels/NEDerivativeKernel.cpp
index ad590e9f2b..5d3fc01bd2 100644
--- a/src/core/NEON/kernels/NEDerivativeKernel.cpp
+++ b/src/core/NEON/kernels/NEDerivativeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,8 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/NEDilateKernel.cpp b/src/core/NEON/kernels/NEDilateKernel.cpp
index c30dab22c6..cc781c699f 100644
--- a/src/core/NEON/kernels/NEDilateKernel.cpp
+++ b/src/core/NEON/kernels/NEDilateKernel.cpp
@@ -28,6 +28,8 @@
 #include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index c22fa6a2b3..56cd6e62d0 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -26,8 +26,6 @@
 #include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
 #include "src/core/NEON/wrapper/wrapper.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
@@ -36,7 +34,11 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <algorithm>
 
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index 8c11574755..abaaf12e92 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -23,8 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -32,9 +30,13 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/Traits.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/NEFixedPoint.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp b/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp
index f862d04b22..efe6161096 100644
--- a/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp
@@ -23,12 +23,14 @@
  */
 #include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/NEFixedPoint.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <map>
diff --git a/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp b/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp
index 40430bdb81..8e4b7eda30 100644
--- a/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp
+++ b/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp
@@ -23,12 +23,14 @@
  */
 #include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/ToolchainSupport.h"
 
 namespace arm_compute
diff --git a/src/core/NEON/kernels/NEErodeKernel.cpp b/src/core/NEON/kernels/NEErodeKernel.cpp
index 4b93c3b4d1..31b0f487d6 100644
--- a/src/core/NEON/kernels/NEErodeKernel.cpp
+++ b/src/core/NEON/kernels/NEErodeKernel.cpp
@@ -28,6 +28,8 @@
 #include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
index d5b20d278d..d8036f2f60 100644
--- a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,6 +28,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <set>
 
diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
index de8ba3f484..1b0af488a2 100644
--- a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
@@ -28,15 +28,16 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/NEON/wrapper/traits.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cmath>
 #include <complex>
 #include <map>
 
-#include "src/core/NEON/wrapper/traits.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
 namespace arm_compute
 {
 namespace
diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.cpp b/src/core/NEON/kernels/NEFFTScaleKernel.cpp
index d99ff953fc..0cb8b84db8 100644
--- a/src/core/NEON/kernels/NEFFTScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTScaleKernel.cpp
@@ -29,6 +29,8 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEFastCornersKernel.cpp b/src/core/NEON/kernels/NEFastCornersKernel.cpp
index 7b1d81e12c..99312f5134 100644
--- a/src/core/NEON/kernels/NEFastCornersKernel.cpp
+++ b/src/core/NEON/kernels/NEFastCornersKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,8 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <algorithm>
 #include <arm_neon.h>
diff --git a/src/core/NEON/kernels/NEFillArrayKernel.cpp b/src/core/NEON/kernels/NEFillArrayKernel.cpp
index 6b22dadd08..93798db6c3 100644
--- a/src/core/NEON/kernels/NEFillArrayKernel.cpp
+++ b/src/core/NEON/kernels/NEFillArrayKernel.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 using namespace arm_compute;
 
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index dbaec83d04..c1dd5cf81f 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <algorithm>
 #include <cstdint>
diff --git a/src/core/NEON/kernels/NEFlattenLayerKernel.cpp b/src/core/NEON/kernels/NEFlattenLayerKernel.cpp
index 35ebc5b70b..e6b34b6165 100644
--- a/src/core/NEON/kernels/NEFlattenLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEFlattenLayerKernel.cpp
@@ -23,13 +23,15 @@
  */
 #include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
diff --git a/src/core/NEON/kernels/NEFloorKernel.cpp b/src/core/NEON/kernels/NEFloorKernel.cpp
index 301dc7a422..48f964c6a2 100644
--- a/src/core/NEON/kernels/NEFloorKernel.cpp
+++ b/src/core/NEON/kernels/NEFloorKernel.cpp
@@ -23,12 +23,14 @@
  */
 #include "arm_compute/core/NEON/kernels/NEFloorKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "src/core/NEON/kernels/floor/impl/list.h"
 #include "src/core/common/Registrars.h"
diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
index 00d251f79e..e353df1c39 100644
--- a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
@@ -23,14 +23,16 @@
  */
 #include "arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <map>
 
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index 8b4ad0da23..2997c1d003 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -31,6 +31,8 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
index f3ba2901cb..acc519012b 100644
--- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
@@ -31,6 +31,10 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
 #include <arm_neon.h>
 
 using namespace arm_compute;
@@ -1052,5 +1056,3 @@ void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo
     }
 }
 } // namespace arm_compute
-
-
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
index 4ac33d1e29..1c76926546 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -32,6 +31,9 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
index 8d0d7c26a3..6a7d225167 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -32,8 +31,11 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp
index 023b798b9a..659c4105c1 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -32,7 +31,10 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/AccessWindowStatic.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
index 68f16c5fc7..afa8cec76f 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -33,7 +32,10 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
 #include "src/core/NEON/NESymm.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
index 2ef32c4e81..83416e03e9 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -33,7 +32,10 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
 #include "src/core/NEON/NEAsymm.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
index 8fc33dcc82..1e8aa0cc0a 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -33,7 +32,10 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
 #include "src/core/NEON/NEAsymm.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
index 1494cd459c..566872f02c 100644
--- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
@@ -23,11 +23,13 @@
  */
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "src/core/AccessWindowStatic.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
index bd931469a3..9aee26ca55 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
@@ -23,12 +23,14 @@
  */
 #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
index 6f74e3fc06..a9236890e3 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -32,9 +31,13 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/helpers/float_ops.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/helpers/float_ops.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
index a8adc45645..b9b4fe9e9c 100644
--- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
@@ -23,12 +23,14 @@
  */
 #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEGatherKernel.cpp b/src/core/NEON/kernels/NEGatherKernel.cpp
index 906e8a053e..193fe98c7b 100644
--- a/src/core/NEON/kernels/NEGatherKernel.cpp
+++ b/src/core/NEON/kernels/NEGatherKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NEGatherKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -32,6 +31,9 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp b/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
index 18dd80e283..5ff5db7266 100644
--- a/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,6 +28,8 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp b/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
index 99b5d4b093..5bb3e76ded 100644
--- a/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
+++ b/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp b/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
index 83d2877836..62cf414df2 100644
--- a/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
+++ b/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,6 +32,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
index c3b105919b..483f204b04 100644
--- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,12 +23,14 @@
  */
 #include "arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
index 84bb59ef0e..00f4087cbc 100644
--- a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
+++ b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,6 +28,8 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <algorithm>
 #include <arm_neon.h>
diff --git a/src/core/NEON/kernels/NEHOGDetectorKernel.cpp b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
index eb0d45000a..d5dfa4195d 100644
--- a/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
+++ b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,6 +28,8 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
index 340c694a7c..be68b9c44b 100644
--- a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
+++ b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,6 +31,8 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <algorithm>
 #include <arm_neon.h>
diff --git a/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp
index fc7b819f6a..a50712598a 100644
--- a/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp
@@ -33,6 +33,8 @@
 #include "arm_compute/core/Window.h"
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <cstdint>
 
diff --git a/src/core/NEON/kernels/NEHistogramKernel.cpp b/src/core/NEON/kernels/NEHistogramKernel.cpp
index 0f8397f117..12d1bb8e7e 100644
--- a/src/core/NEON/kernels/NEHistogramKernel.cpp
+++ b/src/core/NEON/kernels/NEHistogramKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,8 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <algorithm>
 #include <arm_neon.h>
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 6eae0541aa..915ea75431 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -31,6 +30,9 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
index 78acbc399d..7aa23de6eb 100644
--- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -32,8 +31,11 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEIntegralImageKernel.cpp b/src/core/NEON/kernels/NEIntegralImageKernel.cpp
index 58ee3b4bea..5fc6ca65e3 100644
--- a/src/core/NEON/kernels/NEIntegralImageKernel.cpp
+++ b/src/core/NEON/kernels/NEIntegralImageKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,6 +28,8 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
index d99def53ba..a216981f0f 100644
--- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
@@ -31,6 +31,8 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "src/core/NEON/NEMath.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "src/core/NEON/wrapper/wrapper.h"
 #include <arm_neon.h>
diff --git a/src/core/NEON/kernels/NELKTrackerKernel.cpp b/src/core/NEON/kernels/NELKTrackerKernel.cpp
index 533c241b9b..6567a8d206 100644
--- a/src/core/NEON/kernels/NELKTrackerKernel.cpp
+++ b/src/core/NEON/kernels/NELKTrackerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -31,6 +30,9 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cmath>
diff --git a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
index 9eafe18020..b8e6a6d763 100644
--- a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
@@ -33,7 +32,10 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
index a0c1dbc668..8d82e1abd6 100644
--- a/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
+++ b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,6 +28,8 @@
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstdint>
diff --git a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp
index 821bf53817..87caf00477 100644
--- a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp
@@ -23,11 +23,13 @@
  */
 #include "arm_compute/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "support/ToolchainSupport.h"
 
diff --git a/src/core/NEON/kernels/NEMeanStdDevKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
index 914a21c0a0..c4e036a8b9 100644
--- a/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
+++ b/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,8 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cmath>
diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
index bcce843638..8ee9ff6f40 100644
--- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
@@ -23,14 +23,16 @@
  */
 #include "arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NEMedian3x3Kernel.cpp b/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
index 72225a4f43..86fcc30e91 100644
--- a/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,8 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <utility>
diff --git a/src/core/NEON/kernels/NEMemsetKernel.cpp b/src/core/NEON/kernels/NEMemsetKernel.cpp
index 3870fa57f0..fd427cc8c5 100644
--- a/src/core/NEON/kernels/NEMemsetKernel.cpp
+++ b/src/core/NEON/kernels/NEMemsetKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,12 +23,14 @@
  */
 #include "arm_compute/core/NEON/kernels/NEMemsetKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
index b1c2b1c376..f675c391ed 100644
--- a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,6 +33,8 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <algorithm>
 #include <arm_neon.h>
diff --git a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
index e956f9a8d0..e1691dc8ff 100644
--- a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
+++ b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,6 +33,8 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/Utility.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <algorithm>
 #include <arm_neon.h>
diff --git a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
index f20e869272..31919ead03 100644
--- a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
+++ b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,8 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <algorithm>
 #include <arm_neon.h>
diff --git a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
index 3e4c6e29d3..9566ced768 100644
--- a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index 7b888266fb..1b72a3e277 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -23,17 +23,20 @@
  */
 #include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/NEFixedPoint.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/NormalizationHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NEPadLayerKernel.cpp b/src/core/NEON/kernels/NEPadLayerKernel.cpp
index 1b52117bbe..ca9c5419e0 100644
--- a/src/core/NEON/kernels/NEPadLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPadLayerKernel.cpp
@@ -31,6 +31,8 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NEPermuteKernel.cpp b/src/core/NEON/kernels/NEPermuteKernel.cpp
index 3f447f90b9..eab11ebfff 100644
--- a/src/core/NEON/kernels/NEPermuteKernel.cpp
+++ b/src/core/NEON/kernels/NEPermuteKernel.cpp
@@ -30,10 +30,12 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace
 {
-#include "arm_compute/core/NEON/kernels/convolution/common/shims.hpp"
+#include "src/core/NEON/kernels/convolution/common/shims.hpp"
 } // namespace
 
 namespace arm_compute
diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
index c5320b9dbf..0847cb1f23 100644
--- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
@@ -23,11 +23,13 @@
  */
 #include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/NESymm.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index 397eae94ea..f9636dcb8d 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -23,8 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -33,9 +31,13 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/NEFixedPoint.h"
 #include "src/core/NEON/NEMath.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/ToolchainSupport.h"
 
 #include "src/core/NEON/wrapper/wrapper.h"
diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
index 808b68a0d7..06a1f14e5f 100644
--- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
@@ -27,6 +27,8 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
index 6a038f8f44..55585b4e00 100644
--- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
@@ -23,16 +23,18 @@
  */
 #include "arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/NEFixedPoint.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/NESymm.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
 
diff --git a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
index 6d5202d6b5..990e4b67bc 100644
--- a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
@@ -31,8 +31,10 @@
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
-#include "arm_compute/core/CPP/Validate.h"
+#include "src/core/CPP/Validate.h"
 
 #include <arm_neon.h>
 #include <map>
diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
index 955cdc2074..79f7888eba 100644
--- a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
@@ -23,14 +23,16 @@
  */
 #include "arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/misc/Utility.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
index 6a960c74dc..a3171d9aa6 100644
--- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
@@ -23,10 +23,12 @@
  */
 #include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/ToolchainSupport.h"
 
 #include <cfloat>
diff --git a/src/core/NEON/kernels/NERangeKernel.cpp b/src/core/NEON/kernels/NERangeKernel.cpp
index 7d8fbb1ec1..3466794b11 100644
--- a/src/core/NEON/kernels/NERangeKernel.cpp
+++ b/src/core/NEON/kernels/NERangeKernel.cpp
@@ -31,6 +31,8 @@
 #include "arm_compute/core/Validate.h"
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "arm_compute/core/Utils.h"
 
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 9af7f2ab10..716b092396 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NEReductionOperationKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
@@ -32,9 +31,12 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/SaturateCast.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/NEMath.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/SaturateCast.h"
 
 #include "src/core/NEON/wrapper/wrapper.h"
 #include <arm_neon.h>
diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp
index 2881161d7f..f698439507 100644
--- a/src/core/NEON/kernels/NERemapKernel.cpp
+++ b/src/core/NEON/kernels/NERemapKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,13 +23,16 @@
  */
 #include "arm_compute/core/NEON/kernels/NERemapKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/ScaleHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstddef>
@@ -175,6 +178,8 @@ void NERemapKernel::remap_nearest(const Window &window)
 
 void NERemapKernel::remap_bilinear(const Window &window)
 {
+    using namespace scale_helpers;
+
     // Don't increment in X and Y direction for the input tensor
     // A pointer to the start of this plane is needed as base for the precomputed offsets
     Window win_in(window);
diff --git a/src/core/NEON/kernels/NEReorgLayerKernel.cpp b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
index 317bc25967..1c48a5c93d 100644
--- a/src/core/NEON/kernels/NEReorgLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <cstddef>
 #include <cstdint>
diff --git a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
index 23b349b443..7946812811 100644
--- a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
@@ -23,8 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
@@ -33,6 +31,10 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <cstdint>
 
diff --git a/src/core/NEON/kernels/NEReverseKernel.cpp b/src/core/NEON/kernels/NEReverseKernel.cpp
index 0c44a7e0c9..2c081cb917 100644
--- a/src/core/NEON/kernels/NEReverseKernel.cpp
+++ b/src/core/NEON/kernels/NEReverseKernel.cpp
@@ -27,6 +27,8 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index 94f5a18102..e07fcad0ab 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -23,15 +23,17 @@
  */
 #include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Rounding.h"
 #include "arm_compute/core/utils/misc/Utility.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/ScaleHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "src/core/utils/ScaleUtils.h"
+#include "support/Rounding.h"
 
 #include <arm_neon.h>
 #include <map>
@@ -336,6 +338,8 @@ void NEScaleKernel::scale_bilinear_nchw(const Window &window)
 
 void NEScaleKernel::scale_area_nchw_u8(const Window &window)
 {
+    using namespace scale_helpers;
+
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8);
 
     // Don't increment in width/height/channels for the input tensor
diff --git a/src/core/NEON/kernels/NEScharr3x3Kernel.cpp b/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
index dcc9362cf0..eb1dc65c0f 100644
--- a/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,8 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstdint>
diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp
index 286b8a63c8..2f36db2ddb 100644
--- a/src/core/NEON/kernels/NESelectKernel.cpp
+++ b/src/core/NEON/kernels/NESelectKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NESelectKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
@@ -31,7 +30,10 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "utils/TypePrinter.h"
 
 #include <arm_neon.h>
diff --git a/src/core/NEON/kernels/NESobel3x3Kernel.cpp b/src/core/NEON/kernels/NESobel3x3Kernel.cpp
index eb9d3c3020..1c7089b641 100644
--- a/src/core/NEON/kernels/NESobel3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NESobel3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,8 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstdint>
diff --git a/src/core/NEON/kernels/NESobel5x5Kernel.cpp b/src/core/NEON/kernels/NESobel5x5Kernel.cpp
index fc8ccc803d..2421ea72ad 100644
--- a/src/core/NEON/kernels/NESobel5x5Kernel.cpp
+++ b/src/core/NEON/kernels/NESobel5x5Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/NESobel7x7Kernel.cpp b/src/core/NEON/kernels/NESobel7x7Kernel.cpp
index 95ab12b6cd..779d67a044 100644
--- a/src/core/NEON/kernels/NESobel7x7Kernel.cpp
+++ b/src/core/NEON/kernels/NESobel7x7Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstdint>
diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
index e71818f213..13f0a54275 100644
--- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
@@ -23,8 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -32,10 +30,14 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/SaturateCast.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/NEFixedPoint.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/SaturateCast.h"
 
 #include <algorithm>
 #include <arm_neon.h>
diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
index ccad92a685..3293466979 100644
--- a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
@@ -29,6 +29,9 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
 #include <arm_neon.h>
 #include <cstdint>
 
diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
index 2667611d2c..7c9cc4996b 100644
--- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
@@ -29,6 +29,9 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
 #include <arm_neon.h>
 #include <cstdint>
 
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.cpp b/src/core/NEON/kernels/NEStackLayerKernel.cpp
index 1d44be60a0..ad7f1b1300 100644
--- a/src/core/NEON/kernels/NEStackLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEStackLayerKernel.cpp
@@ -33,6 +33,8 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 using namespace arm_compute;
 using namespace arm_compute::misc::shape_calculator;
diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.cpp b/src/core/NEON/kernels/NEStridedSliceKernel.cpp
index 243a60f249..13b2cb5a10 100644
--- a/src/core/NEON/kernels/NEStridedSliceKernel.cpp
+++ b/src/core/NEON/kernels/NEStridedSliceKernel.cpp
@@ -23,16 +23,17 @@
  */
 #include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Window.h"
-
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/helpers/bit_ops.h"
+#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/helpers/tensor_transform.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/helpers/bit_ops.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NEThresholdKernel.cpp b/src/core/NEON/kernels/NEThresholdKernel.cpp
index 9e8ec5c106..aad440b120 100644
--- a/src/core/NEON/kernels/NEThresholdKernel.cpp
+++ b/src/core/NEON/kernels/NEThresholdKernel.cpp
@@ -27,6 +27,8 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "src/core/NEON/wrapper/wrapper.h"
 
diff --git a/src/core/NEON/kernels/NETileKernel.cpp b/src/core/NEON/kernels/NETileKernel.cpp
index cc7655a479..99651c8b8a 100644
--- a/src/core/NEON/kernels/NETileKernel.cpp
+++ b/src/core/NEON/kernels/NETileKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,8 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NETransposeKernel.cpp b/src/core/NEON/kernels/NETransposeKernel.cpp
index 7118e45f1e..6037810a44 100644
--- a/src/core/NEON/kernels/NETransposeKernel.cpp
+++ b/src/core/NEON/kernels/NETransposeKernel.cpp
@@ -23,14 +23,16 @@
  */
 #include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/AccessWindowTranspose.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp b/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
index 69324c1693..129c83c695 100644
--- a/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -31,7 +30,10 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/NEON/kernels/NEWarpKernel.cpp b/src/core/NEON/kernels/NEWarpKernel.cpp
index d8191dce53..891304f02c 100644
--- a/src/core/NEON/kernels/NEWarpKernel.cpp
+++ b/src/core/NEON/kernels/NEWarpKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -31,6 +30,10 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/ScaleHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <cstddef>
 
@@ -184,7 +187,7 @@ void NEWarpAffineKernel<interpolation>::warp_undefined(const Window &window)
                     *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
                     break;
                 case InterpolationPolicy::BILINEAR:
-                    *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, x0, y0);
+                    *out.ptr() = scale_helpers::pixel_bilinear_c1(in.ptr(), stride, x0, y0);
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -271,7 +274,7 @@ void NEWarpAffineKernel<interpolation>::warp_constant(const Window &window)
                     *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
                     break;
                 case InterpolationPolicy::BILINEAR:
-                    *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, x0, y0);
+                    *out.ptr() = scale_helpers::pixel_bilinear_c1(in.ptr(), stride, x0, y0);
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -386,7 +389,7 @@ void NEWarpAffineKernel<interpolation>::warp_replicate(const Window &window)
                     *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
                     break;
                 case InterpolationPolicy::BILINEAR:
-                    *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, x0, y0);
+                    *out.ptr() = scale_helpers::pixel_bilinear_c1(in.ptr(), stride, x0, y0);
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -519,7 +522,7 @@ void NEWarpPerspectiveKernel<interpolation>::warp_undefined(const Window &window
                     *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
                     break;
                 case InterpolationPolicy::BILINEAR:
-                    *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, xn, yn);
+                    *out.ptr() = scale_helpers::pixel_bilinear_c1(in.ptr(), stride, xn, yn);
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -620,7 +623,7 @@ void NEWarpPerspectiveKernel<interpolation>::warp_constant(const Window &window)
                     *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
                     break;
                 case InterpolationPolicy::BILINEAR:
-                    *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, xn, yn);
+                    *out.ptr() = scale_helpers::pixel_bilinear_c1(in.ptr(), stride, xn, yn);
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -752,7 +755,7 @@ void NEWarpPerspectiveKernel<interpolation>::warp_replicate(const Window &window
                     *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
                     break;
                 case InterpolationPolicy::BILINEAR:
-                    *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, xn, yn);
+                    *out.ptr() = scale_helpers::pixel_bilinear_c1(in.ptr(), stride, xn, yn);
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Interpolation not supported");
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index 6a74914ff7..c7fa2d2365 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -25,6 +25,8 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
index d12b10c69e..90afbd6a19 100644
--- a/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
@@ -33,6 +33,8 @@
 #include "arm_compute/core/Window.h"
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <cstdint>
 
diff --git a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
index bfe97bfbdb..211ebdec90 100644
--- a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
@@ -23,16 +23,18 @@
  */
 #include "src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/NEON/kernels/convolution/common/utils.hpp"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/MemorySupport.h"
 
 #include "src/core/NEON/kernels/convolution/winograd/winograd_layer.hpp"
diff --git a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
index 94df4f6952..bf5d77fc43 100644
--- a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
+++ b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
@@ -25,8 +25,8 @@
 #define ARM_COMPUTE_NEGEMMWINOGRADCONVOLUTIONLAYERKERNEL_H
 
 #include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/NEON/kernels/convolution/common/convolution.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/tensor.hpp"
+#include "src/core/NEON/kernels/convolution/common/convolution.hpp"
+#include "src/core/NEON/kernels/convolution/common/tensor.hpp"
 
 #include "src/core/NEON/kernels/convolution/winograd/winograd_layer.hpp"
 
diff --git a/src/core/NEON/kernels/NEYOLOLayerKernel.cpp b/src/core/NEON/kernels/NEYOLOLayerKernel.cpp
index 591aa1e5e6..48c0616b35 100644
--- a/src/core/NEON/kernels/NEYOLOLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEYOLOLayerKernel.cpp
@@ -23,16 +23,18 @@
  */
 #include "arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/NEFixedPoint.h"
 #include "src/core/NEON/NEMath.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
 
diff --git a/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp b/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp
index b071be3749..760274dba1 100644
--- a/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp
+++ b/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,7 +22,7 @@
  * SOFTWARE.
  */
 
-#include "arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
+#include "src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h b/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h
new file mode 100644
index 0000000000..030f1aad12
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_INEGEMMWRAPPERKERNEL_H
+#define SRC_INEGEMMWRAPPERKERNEL_H
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Common interface for all the arm_gemm Gemms
+ */
+class INEGEMMWrapperKernel : public INEKernel
+{
+public:
+    /** Parameters defining the dimensions of the matrices being multiplied */
+    struct Params
+    {
+        unsigned int M{ 0 };       /**< Rows in output matrix C (and input matrix A). */
+        unsigned int N{ 0 };       /**< Columns in output matrix C (and input matrix B). */
+        unsigned int K{ 0 };       /**< Columns of input matrix A (= rows of input matrix B). */
+        unsigned int batches{ 0 }; /**< Number of "batched" GEMMs (unique A and C, shared B). */
+        unsigned int multis{ 0 };  /**< Number of "multi" GEMMs (unique A, B and C). */
+    };
+
+    static Params extract_parameters(const ITensor *a, const ITensor *b, const ITensor *c, const GEMMInfo &gemm_info);
+
+    /** Constructor */
+    INEGEMMWrapperKernel();
+    /** Prevent instances of this class from being copied */
+    INEGEMMWrapperKernel(const INEGEMMWrapperKernel &) = delete;
+    /** Prevent instances of this class from being copied */
+    INEGEMMWrapperKernel &operator=(const INEGEMMWrapperKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    INEGEMMWrapperKernel(INEGEMMWrapperKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    INEGEMMWrapperKernel &operator=(INEGEMMWrapperKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @note The input and output tensor must have the same dimensions
+     *
+     * @param[in]  a         Input tensor (Matrix A)
+     * @param[in]  b         Input tensor (Matrix B)
+     * @param[out] c         Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
+     * @param[in]  alpha     Scalar multiplier to apply to AB matrix product.
+     * @param[in]  beta      Scalar multiplier to apply to input C matrix before adding product.
+     * @param[in]  gemm_info GEMM meta-data
+     */
+    void configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, const GEMMInfo &gemm_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+protected:
+    /** Called as part of configure() after _a, _b, _c and _params have been set.
+     *
+     * @param[in] alpha Scalar multiplier to apply to AB matrix product.
+     * @param[in] beta  Scalar multiplier to apply to input C matrix before adding product.
+     *
+     * @return A 3D execution window.
+     */
+    virtual Window configure_internal(float alpha, float beta) = 0;
+
+    /** Run the kernel from the start to the end offset in window.
+     *
+     * @param[in] window       Window to use for the iteration
+     * @param[in] start_offset Where to start iterating from (In Window coordinates)
+     * @param[in] end_offset   Where to stop iterating (In Window coordinates).
+     * @param[in] info         Info about executing thread and CPU.
+     */
+    virtual void run_internal(const Window &window, const Coordinates &start_offset, const Coordinates &end_offset, const ThreadInfo &info) = 0;
+
+    const ITensor *_a;
+    const ITensor *_b;
+    ITensor       *_c;
+    Params         _params;
+    GEMMInfo       _gemm_info;
+
+private:
+    Window      _window3d;
+    TensorShape _window_shape;
+};
+
+} // namespace arm_compute
+
+#endif /* SRC_INEGEMMRAPPERKERNEL_H */
diff --git a/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h b/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
new file mode 100644
index 0000000000..a2f7e3bd59
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H
+#define SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** This class is a wrapper for the depthwise convolution assembly kernels.  */
+class NEDepthwiseConvolutionAssemblyKernelWrapper final : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEDepthwiseConvolutionAssemblyKernelWrapper";
+    }
+
+    /** Default constructor */
+    NEDepthwiseConvolutionAssemblyKernelWrapper()
+        : _kernel(nullptr)
+    {
+    }
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthwiseConvolutionAssemblyKernelWrapper(const NEDepthwiseConvolutionAssemblyKernelWrapper &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthwiseConvolutionAssemblyKernelWrapper &operator=(const NEDepthwiseConvolutionAssemblyKernelWrapper &) = delete;
+    /** Default Move Constructor. */
+    NEDepthwiseConvolutionAssemblyKernelWrapper(NEDepthwiseConvolutionAssemblyKernelWrapper &&) = default;
+    /** Default move assignment operator */
+    NEDepthwiseConvolutionAssemblyKernelWrapper &operator=(NEDepthwiseConvolutionAssemblyKernelWrapper &&) = default;
+
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in] kernel Pointer to an assembly kernel implementation.
+     */
+    void configure(depthwise::IDepthwiseConvolution *kernel)
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(kernel)));
+        _kernel = kernel;
+        Window win;
+        win.set(Window::DimX, Window::Dimension(0, _kernel->get_window(), 1));
+        INEKernel::configure(win);
+    }
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
+        ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+        auto first = window.x().start();
+        auto last  = window.x().end();
+        _kernel->run(first, last, info.thread_id);
+    }
+
+private:
+    depthwise::IDepthwiseConvolution *_kernel;
+};
+} // namespace arm_compute
+#endif /* SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H */
diff --git a/src/core/NEON/kernels/assembly/arm_gemm_local.hpp b/src/core/NEON/kernels/assembly/arm_gemm_local.hpp
new file mode 100644
index 0000000000..4715f2500a
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/arm_gemm_local.hpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+/* This file is used to configure integration-specific aspects of arm_gemm into ACL */
+
+#include "arm_compute/core/CPP/CPPTypes.h"
+
+namespace arm_gemm
+{
+using CPUModel = arm_compute::CPUModel;
+using CPUInfo  = arm_compute::CPUInfo;
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/convolution/common/activation.hpp b/src/core/NEON/kernels/convolution/common/activation.hpp
new file mode 100644
index 0000000000..0c9b7c1368
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/common/activation.hpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+namespace neon_convolution_kernels
+{
+
+enum class ActivationFunction
+{
+  None,
+  ReLU,
+  ReLU6,
+};
+
+}
diff --git a/src/core/NEON/kernels/convolution/common/alloc.hpp b/src/core/NEON/kernels/convolution/common/alloc.hpp
new file mode 100644
index 0000000000..7be3cdaaf5
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/common/alloc.hpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2017 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef ALLOC_ALIGN
+#define ALLOCATE(x) aligned_alloc(ALLOC_ALIGN, x)
+#else
+#define ALLOCATE(x) malloc(x)
+#endif
diff --git a/src/core/NEON/kernels/convolution/common/arm.hpp b/src/core/NEON/kernels/convolution/common/arm.hpp
new file mode 100644
index 0000000000..b19bf98252
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/common/arm.hpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2017 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/** Sets the macro __arm_any__ if compiling for Aarch32 or Aarch64.
+ *  Includes `arm_neon.h` if compiling for either architecture.
+ */
+
+#ifdef __arm__
+#define __arm_any__
+#endif  // __arm__
+
+#ifdef __aarch64__
+#define __arm_any__
+#endif  // __aarch64__
+
+#ifdef __arm_any__
+#include <arm_neon.h>
+#endif  // __arm_any__
diff --git a/src/core/NEON/kernels/convolution/common/convolution.hpp b/src/core/NEON/kernels/convolution/common/convolution.hpp
new file mode 100644
index 0000000000..b1413527c3
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/common/convolution.hpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2017 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+enum PaddingType {
+  PADDING_SAME, PADDING_VALID
+};
diff --git a/src/core/NEON/kernels/convolution/common/padding.hpp b/src/core/NEON/kernels/convolution/common/padding.hpp
new file mode 100644
index 0000000000..b6f95872c0
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/common/padding.hpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <cstddef>
+
+// Utilities for copying tensor tiles and adding/removing padding.
+namespace padding
+{
+
+/* Copy a tile and apply padding to the output copy.
+ */
+template <typename T>
+void copy_and_pad_tile(
+  unsigned int tile_rows,
+  unsigned int tile_cols,
+  unsigned int n_channels,
+  const T *inptr,
+  unsigned int in_row_stride,
+  unsigned int in_col_stride,
+  T* outptr,
+  unsigned int out_row_stride,
+  unsigned int out_col_stride,
+  unsigned int pad_top,
+  unsigned int pad_left,
+  unsigned int pad_bottom,
+  unsigned int pad_right,
+  T pad_value=static_cast<T>(0)
+);
+
+/** Copy a tile and remove padding elements in the output.
+ */
+template <unsigned int TileRows, unsigned int TileCols>
+class CopyCropped
+{
+  public:
+    static void execute(
+      size_t size,  // Amount of data to copy
+      const void *inptr,
+      size_t in_row_stride,
+      size_t in_col_stride,
+      void *outptr,
+      size_t out_row_stride,
+      size_t out_col_stride,
+      unsigned int pad_top,
+      unsigned int pad_left,
+      unsigned int pad_bottom,
+      unsigned int pad_right
+    );
+};
+
+template <typename T>
+void crop_and_copy_tile(
+  unsigned int tile_rows,
+  unsigned int tile_cols,
+  unsigned int n_channels,
+  const T *inptr,
+  unsigned int in_row_stride,
+  unsigned int in_col_stride,
+  T *outptr,
+  unsigned int out_row_stride,
+  unsigned int out_col_stride,
+  unsigned int crop_top,
+  unsigned int crop_left,
+  unsigned int crop_bottom,
+  unsigned int crop_right
+);
+
+}
diff --git a/src/core/NEON/kernels/convolution/common/perf.h b/src/core/NEON/kernels/convolution/common/perf.h
new file mode 100644
index 0000000000..fbae4dcdfa
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/common/perf.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+/* Prototypes from perf.c */
+
+void start_counter(int fd);
+long long get_counter(int fd);
+long long stop_counter(int fd);
+int open_instruction_counter(void);
+int open_cycle_counter(void);
diff --git a/src/core/NEON/kernels/convolution/common/qasymm8.hpp b/src/core/NEON/kernels/convolution/common/qasymm8.hpp
new file mode 100644
index 0000000000..88ef7327c0
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/common/qasymm8.hpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+#include <cstdint>
+
+namespace qasymm8
+{
+
+struct QAsymm8Params
+{
+  uint8_t quantize(float value) const;
+  float dequantize(uint8_t value) const;
+
+  uint8_t offset;
+  float scale;
+};
+
+struct QAsymm8RescaleParams
+{
+  static QAsymm8RescaleParams make_rescale_params(
+    const QAsymm8Params& weight_quant,
+    const QAsymm8Params& input_quant,
+    const QAsymm8Params& output_quant
+  );
+
+  QAsymm8RescaleParams(int32_t shift, int32_t multiplier, float rescale);
+
+  const int32_t shift, multiplier;
+  const float rescale;
+};
+
+}
diff --git a/src/core/NEON/kernels/convolution/common/qsymm8.hpp b/src/core/NEON/kernels/convolution/common/qsymm8.hpp
new file mode 100644
index 0000000000..726a02ccfd
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/common/qsymm8.hpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+#include <cstdint>
+#include <vector>
+#include "qasymm8.hpp"
+
+
+namespace qsymm8 {
+
+struct QSymm8Params {
+  int8_t quantize(float value) const;
+  float dequantize(int8_t value) const;
+
+  float scale;
+};
+
+struct QSymm8RescaleParams {
+  static QSymm8RescaleParams
+  make_rescale_params(const QSymm8Params &weight_quant,
+                      const QSymm8Params &input_quant,
+                      const QSymm8Params &output_quant);
+
+  QSymm8RescaleParams(int32_t shift, int32_t multiplier, float rescale);
+
+  const int32_t shift, multiplier;
+  const float rescale;
+};
+
+struct QSymm8PerChannelParams {
+  int8_t quantize(float value, float scale) const;
+  float dequantize(int8_t value, float scale) const;
+
+  std::vector<float> scales;
+};
+
+struct QSymm8PerChannelRescaleParams {
+  static QSymm8PerChannelRescaleParams
+  make_rescale_params(const QSymm8PerChannelParams &weight_quant,
+                      const QSymm8PerChannelParams &input_quant,
+                      const QSymm8PerChannelParams &output_quant);
+
+  static QSymm8PerChannelRescaleParams
+  make_rescale_params(const QSymm8PerChannelParams &weight_quant,
+                      const qasymm8::QAsymm8Params &input_quant,
+                      const qasymm8::QAsymm8Params &output_quant);
+
+  QSymm8PerChannelRescaleParams(std::vector<int32_t>& shift, std::vector<int32_t>& multiplier, std::vector<float>& rescale);
+
+  std::vector<int32_t>  shifts, multipliers;
+  std::vector<float> rescales;
+};
+
+} // namespace qsymm8
diff --git a/src/core/NEON/kernels/convolution/common/shims.hpp b/src/core/NEON/kernels/convolution/common/shims.hpp
new file mode 100644
index 0000000000..310bd47b82
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/common/shims.hpp
@@ -0,0 +1,749 @@
+/*
+ * Copyright (c) 2017 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+#ifndef DOXYGEN_SKIP_THIS
+#include <cstdint>
+#endif /* DOXYGEN_SKIP_THIS */
+#include "arm.hpp"
+
+namespace reorder {
+/** Re-order a tensor from NCHW format to NHWC.
+ *
+ * @note The stride parameters are optional and are provided to allow padding in either input or output tensors.
+ *
+ * @param[in] in Input tensor in NCHW format.
+ * @param[out] out Output tensor, to be written in NHWC format.
+ * @param n_batches Number of batches in the tensors.
+ * @param n_channels Number of channels in the tensors
+ * @param n_rows Height of the tensor
+ * @param n_cols Width of the tensor
+ * @param in_batch_stride Stride over batches in the input tensor. If `0` defaults to `n_channels * in_channel_stride`.
+ * @param in_channel_stride Stride over channels in the input tensor. If `0` defaults to `n_rows * in_row_stride`.
+ * @param in_row_stride Stride over rows in the input tensor. If `0` defaults to `n_cols`.
+ * @param out_batch_stride Stride over batches in the output tensor. If `0` defaults to `n_rows * out_row_stride`.
+ * @param out_row_stride Stride over rows in the output tensor. If `0` defaults to `n_cols * out_col_stride`.
+ * @param out_col_stride Stride over columns in the output tensor. If `0` defaults to `n_channels`.
+ */
+template <typename T>
+inline void nchw_to_nhwc(
+  const T* const in,
+  T* const out,
+  const int n_batches,
+  const int n_channels,
+  const int n_rows,
+  const int n_cols,
+  int in_batch_stride=0,
+  int in_channel_stride=0,
+  int in_row_stride=0,
+  int out_batch_stride=0,
+  int out_row_stride=0,
+  int out_col_stride=0
+);
+
+/** Re-order a tensor from NHWC format to NCHW.
+ *
+ * @note The stride parameters are optional and are provided to allow padding in either input or output tensors.
+ *
+ * @param[in] in Input tensor in NHWC format.
+ * @param[out] out Output tensor, to be written in NCHW format.
+ * @param n_batches Number of batches in the tensors.
+ * @param n_rows Height of the tensor
+ * @param n_cols Width of the tensor
+ * @param n_channels Number of channels in the tensors
+ * @param in_batch_stride Stride over batches in the input tensor. If `0` defaults to `n_rows * in_row_stride`.
+ * @param in_row_stride Stride over rows in the input tensor. If `0` defaults to `n_cols * in_col_stride`.
+ * @param in_col_stride Stride over columns in the input tensor. If `0` defaults to `n_channels`.
+ * @param out_batch_stride Stride over batches in the output tensor. If `0` defaults to `n_channels * out_channel_stride`.
+ * @param out_channel_stride Stride over channels in the output tensor. If `0` defaults to `n_rows * out_row_stride`.
+ * @param out_row_stride Stride over rows in the output tensor. If `0` defaults to `n_cols`.
+ */
+template <typename T>
+inline void nhwc_to_nchw(
+  const T* const in,  // Input data in NHWC form
+  T* const out,       // Output data in NCHW form
+  const int n_batches,
+  const int n_rows,
+  const int n_cols,
+  const int n_channels,
+  int in_batch_stride=0,
+  int in_row_stride=0,
+  int in_col_stride=0,
+  int out_batch_stride=0,
+  int out_channel_stride=0,
+  int out_row_stride=0
+);
+
+/** Re-order a weight tensor from [Output feature map x Input feature map x
+ *  Height x Width] format to [Height x Width x Input feature map x Output
+ *  feature map] format.
+ */
+template <typename T>
+inline void ofm_ifm_h_w_to_h_w_ifm_ofm(
+  const T* const in,  // Input in [Output x Input x Height x Width] form
+  T* const out,       // Output in [Height x Width x Input x Output] form
+  const int n_output_feature_maps,
+  const int n_input_feature_maps,
+  const int n_rows,
+  const int n_cols,
+  int in_output_feature_map_stride=0,
+  int in_input_feature_map_stride=0,
+  int in_row_stride=0,
+  int out_row_stride=0,
+  int out_col_stride=0,
+  int out_input_feature_map_stride=0
+);
+
+/** Re-order a weight tensor from [Height x Width x Input feature map x Output
+ *  feature map] format to [Output feature map x Input feature map x Height x
+ *  Width] format.
+ */
+template <typename T>
+inline void h_w_ifm_ofm_to_ofm_ifm_h_w(
+  const T* const in,  // Input in [Height x Width x Input x Output] form
+  T* const out,       // Output in [Output x Input x Height x Width] form
+  const int n_rows,
+  const int n_cols,
+  const int n_input_feature_maps,
+  const int n_output_feature_maps,
+  int in_row_stride=0,
+  int in_col_stride=0,
+  int in_input_feature_map_stride=0,
+  int out_output_feature_map_stride=0,
+  int out_input_feature_map_stride=0,
+  int out_row_stride=0
+);
+
+/*****************************************************************************/
+/* 32-bit implementation : NCHW -> NHWC
+ */
+template <>
+inline void nchw_to_nhwc(
+  const int32_t* const in,
+  int32_t* const out,
+  const int n_batches,
+  const int n_channels,
+  const int n_rows,
+  const int n_cols,
+  int in_batch_stride,
+  int in_channel_stride,
+  int in_row_stride,
+  int out_batch_stride,
+  int out_row_stride,
+  int out_col_stride
+)
+{
+  typedef int32_t T;
+
+  // Fill in the stride values
+  in_row_stride = (in_row_stride) ? in_row_stride : n_cols;
+  in_channel_stride = (in_channel_stride) ? in_channel_stride
+                                          : n_rows * in_row_stride;
+  in_batch_stride = (in_batch_stride) ? in_batch_stride
+                                      : n_channels * in_channel_stride;
+
+  out_col_stride = (out_col_stride) ? out_col_stride : n_channels;
+  out_row_stride = (out_row_stride) ? out_row_stride : n_cols * out_col_stride;
+  out_batch_stride = (out_batch_stride) ? out_batch_stride
+                                        : n_rows * out_row_stride;
+
+  // Perform the re-ordering
+  for (int n = 0; n < n_batches; n++)
+  {
+    const T* const in_batch = in + n*in_batch_stride;
+    T* const out_batch = out + n*out_batch_stride;
+
+    for (int i = 0; i < n_rows; i++)
+    {
+      const T* const in_row = in_batch + i*in_row_stride;
+      T* const out_row = out_batch + i*out_row_stride;
+
+      int j = 0, j_remaining = n_cols;
+#ifdef __arm_any__
+      for (; j_remaining >= 4; j += 4, j_remaining -= 4)
+      {
+        int c = 0, c_remaining = n_channels;
+        for (; c_remaining >= 4; c += 4, c_remaining -= 4)
+        {
+          // Read 4 channels worth of 4 columns, then zip to produce 4 columns
+          // worth of 4 channels.
+          int32x4_t channel_pixels[4];
+          channel_pixels[0] = vld1q_s32(in_row + (c + 0)*in_channel_stride + j);
+          channel_pixels[1] = vld1q_s32(in_row + (c + 1)*in_channel_stride + j);
+          channel_pixels[2] = vld1q_s32(in_row + (c + 2)*in_channel_stride + j);
+          channel_pixels[3] = vld1q_s32(in_row + (c + 3)*in_channel_stride + j);
+
+          const auto zip1 = vzipq_s32(channel_pixels[0], channel_pixels[2]);
+          const auto zip2 = vzipq_s32(channel_pixels[1], channel_pixels[3]);
+          const auto out_0 = vzipq_s32(zip1.val[0], zip2.val[0]);
+          const auto out_1 = vzipq_s32(zip1.val[1], zip2.val[1]);
+
+          vst1q_s32(out_row + (j + 0)*out_col_stride + c, out_0.val[0]);
+          vst1q_s32(out_row + (j + 1)*out_col_stride + c, out_0.val[1]);
+          vst1q_s32(out_row + (j + 2)*out_col_stride + c, out_1.val[0]);
+          vst1q_s32(out_row + (j + 3)*out_col_stride + c, out_1.val[1]);
+        }
+        for (; c_remaining; c++, c_remaining--)
+        {
+          for (int _j = 0; _j < 4; _j++)
+          {
+            const T* const in_col = in_row + j + _j;
+            T* const out_col = out_row + (j + _j)*out_col_stride;
+            const T* const in_channel = in_col + c*in_channel_stride;
+            out_col[c] = *(in_channel);
+          }
+        }
+      }
+      for (; j_remaining >= 2; j += 2, j_remaining -= 2)
+      {
+        int c = 0, c_remaining = n_channels;
+        for (; c_remaining >= 2; c += 2, c_remaining -= 2)
+        {
+          // Read 2 channels worth of 2 columns, then zip to produce 2 columns
+          // worth of 2 channels.
+          int32x2_t channel_pixels[2];
+          channel_pixels[0] = vld1_s32(in_row + (c + 0)*in_channel_stride + j);
+          channel_pixels[1] = vld1_s32(in_row + (c + 1)*in_channel_stride + j);
+
+          const auto output = vzip_s32(channel_pixels[0], channel_pixels[1]);
+
+          vst1_s32(out_row + (j + 0)*out_col_stride + c, output.val[0]);
+          vst1_s32(out_row + (j + 1)*out_col_stride + c, output.val[1]);
+        }
+        for (; c_remaining; c++, c_remaining--)
+        {
+          for (int _j = 0; _j < 2; _j++)
+          {
+            const T* const in_col = in_row + j + _j;
+            T* const out_col = out_row + (j + _j)*out_col_stride;
+            const T* const in_channel = in_col + c*in_channel_stride;
+            out_col[c] = *(in_channel);
+          }
+        }
+      }
+#endif  // __arm_any__
+      for (; j_remaining; j++, j_remaining--)
+      {
+        const T* const in_col = in_row + j;
+        T* const out_col = out_row + j*out_col_stride;
+
+        for (int c = 0; c < n_channels; c++)
+        {
+          const T* const in_channel = in_col + c*in_channel_stride;
+          out_col[c] = *(in_channel);
+        }
+      }
+    }
+  }
+}
+
+template <>
+inline void nchw_to_nhwc(
+  const uint32_t* const in,
+  uint32_t* const out,
+  const int n_batches,
+  const int n_channels,
+  const int n_rows,
+  const int n_cols,
+  int in_batch_stride,
+  int in_channel_stride,
+  int in_row_stride,
+  int out_batch_stride,
+  int out_row_stride,
+  int out_col_stride
+)
+{
+  nchw_to_nhwc(
+    reinterpret_cast<const int32_t*>(in),
+    reinterpret_cast<int32_t*>(out),
+    n_batches, n_channels, n_rows, n_cols,
+    in_batch_stride, in_channel_stride, in_row_stride,
+    out_batch_stride, out_row_stride, out_col_stride
+  );
+}
+
+template <>
+inline void nchw_to_nhwc(
+  const float* const in,
+  float* const out,
+  const int n_batches,
+  const int n_channels,
+  const int n_rows,
+  const int n_cols,
+  int in_batch_stride,
+  int in_channel_stride,
+  int in_row_stride,
+  int out_batch_stride,
+  int out_row_stride,
+  int out_col_stride
+)
+{
+  nchw_to_nhwc(
+    reinterpret_cast<const int32_t*>(in),
+    reinterpret_cast<int32_t*>(out),
+    n_batches, n_channels, n_rows, n_cols,
+    in_batch_stride, in_channel_stride, in_row_stride,
+    out_batch_stride, out_row_stride, out_col_stride
+  );
+}
+
+/*****************************************************************************/
+/* Generic implementation : NCHW -> NHWC
+ */
+template <typename T>
+inline void nchw_to_nhwc(
+  const T* const in,
+  T* const out,
+  const int n_batches,
+  const int n_channels,
+  const int n_rows,
+  const int n_cols,
+  int in_batch_stride,
+  int in_channel_stride,
+  int in_row_stride,
+  int out_batch_stride,
+  int out_row_stride,
+  int out_col_stride
+)
+{
+  // Fill in the stride values
+  in_row_stride = (in_row_stride) ? in_row_stride : n_cols;
+  in_channel_stride = (in_channel_stride) ? in_channel_stride
+                                          : n_rows * in_row_stride;
+  in_batch_stride = (in_batch_stride) ? in_batch_stride
+                                      : n_channels * in_channel_stride;
+
+  out_col_stride = (out_col_stride) ? out_col_stride : n_channels;
+  out_row_stride = (out_row_stride) ? out_row_stride : n_cols * out_col_stride;
+  out_batch_stride = (out_batch_stride) ? out_batch_stride
+                                        : n_rows * out_row_stride;
+
+  // Perform the re-ordering
+  for (int n = 0; n < n_batches; n++)
+  {
+    const T* const in_batch = in + n*in_batch_stride;
+    T* const out_batch = out + n*out_batch_stride;
+
+    for (int i = 0; i < n_rows; i++)
+    {
+      const T* const in_row = in_batch + i*in_row_stride;
+      T* const out_row = out_batch + i*out_row_stride;
+
+      for (int j = 0; j < n_cols; j++)
+      {
+        const T* const in_col = in_row + j;
+        T* const out_col = out_row + j*out_col_stride;
+
+        for (int c = 0; c < n_channels; c++)
+        {
+          const T* const in_channel = in_col + c*in_channel_stride;
+          out_col[c] = *(in_channel);
+        }
+      }
+    }
+  }
+}
+
+/*****************************************************************************/
+/* 32-bit implementation : NHWC -> NCHW
+ */
+template <>
+inline void nhwc_to_nchw(
+  const int32_t* const in,  // Input data in NHWC form
+  int32_t* const out,       // Output data in NCHW form
+  const int n_batches,
+  const int n_rows,
+  const int n_cols,
+  const int n_channels,
+  int in_batch_stride,
+  int in_row_stride,
+  int in_col_stride,
+  int out_batch_stride,
+  int out_channel_stride,
+  int out_row_stride
+)
+{
+  typedef int32_t T;
+
+  // Fill in stride values
+  in_col_stride = (in_col_stride) ? in_col_stride : n_channels;
+  in_row_stride = (in_row_stride) ? in_row_stride : n_cols * in_col_stride;
+  in_batch_stride = (in_batch_stride) ? in_batch_stride
+                                      : n_rows * in_row_stride;
+
+  out_row_stride = (out_row_stride) ? out_row_stride : n_cols;
+  out_channel_stride = (out_channel_stride) ? out_channel_stride
+                                            : n_rows * out_row_stride;
+  out_batch_stride = (out_batch_stride) ? out_batch_stride
+                                        : n_channels * out_channel_stride;
+
+  // Perform the re-ordering
+  // For every batch
+  for (int n = 0; n < n_batches; n++)
+  {
+    const T* const in_batch = in + n*in_batch_stride;
+    T* const out_batch = out + n*out_batch_stride;
+
+    // For every row
+    for (int i = 0; i < n_rows; i++)
+    {
+      const T* const in_i = in_batch + i*in_row_stride;
+      T* const out_i = out_batch + i*out_row_stride;
+
+      // For every column, beginning with chunks of 4
+      int j = 0, j_remaining = n_cols;
+#ifdef __arm_any__
+      for (; j_remaining >= 4; j += 4, j_remaining -=4)
+      {
+        // For every channel, beginning with chunks of 4
+        int c = 0, c_remaining = n_channels;
+        for (; c_remaining >= 4; c += 4, c_remaining -= 4)
+        {
+          // Read 4 columns worth of 4 channels then zip to produce 4 channels
+          // worth of 4 columns.
+          int32x4_t pixel_channels[4];
+          pixel_channels[0] = vld1q_s32(in_i + (j + 0)*in_col_stride + c);
+          pixel_channels[1] = vld1q_s32(in_i + (j + 1)*in_col_stride + c);
+          pixel_channels[2] = vld1q_s32(in_i + (j + 2)*in_col_stride + c);
+          pixel_channels[3] = vld1q_s32(in_i + (j + 3)*in_col_stride + c);
+
+          const auto zip1 = vzipq_s32(pixel_channels[0], pixel_channels[2]);
+          const auto zip2 = vzipq_s32(pixel_channels[1], pixel_channels[3]);
+          const auto out_0 = vzipq_s32(zip1.val[0], zip2.val[0]);
+          const auto out_1 = vzipq_s32(zip1.val[1], zip2.val[1]);
+
+          vst1q_s32(out_i + j + (c + 0)*out_channel_stride, out_0.val[0]);
+          vst1q_s32(out_i + j + (c + 1)*out_channel_stride, out_0.val[1]);
+          vst1q_s32(out_i + j + (c + 2)*out_channel_stride, out_1.val[0]);
+          vst1q_s32(out_i + j + (c + 3)*out_channel_stride, out_1.val[1]);
+        }
+        for (; c_remaining; c++, c_remaining--)
+        {
+          for (int _j = 0; _j < 4; _j++)
+          {
+            const T* const in_j = in_i + (j + _j)*in_col_stride;
+            T* const out_j = out_i + (j + _j);
+
+            const T* const in_channel = in_j + c;
+            T* const out_channel = out_j + c*out_channel_stride;
+            *(out_channel) = *(in_channel);
+          }
+        }
+      }
+      for (; j_remaining >= 2; j += 2, j_remaining -=2)
+      {
+        int c = 0, c_remaining = n_channels;
+        for (; c_remaining >= 2; c += 2, c_remaining -= 2)
+        {
+          // Read 2 columns worth of 2 channels then zip to produce 2 channels
+          // worth of 2 columns.
+          int32x2_t pixel_channels[2];
+          pixel_channels[0] = vld1_s32(in_i + (j + 0)*in_col_stride + c);
+          pixel_channels[1] = vld1_s32(in_i + (j + 1)*in_col_stride + c);
+
+          const auto output = vzip_s32(pixel_channels[0], pixel_channels[1]);
+
+          vst1_s32(out_i + j + (c + 0)*out_channel_stride, output.val[0]);
+          vst1_s32(out_i + j + (c + 1)*out_channel_stride, output.val[1]);
+        }
+        for (; c_remaining; c++, c_remaining--)
+        {
+          for (int _j = 0; _j < 2; _j++)
+          {
+            const T* const in_j = in_i + (j + _j)*in_col_stride;
+            T* const out_j = out_i + (j + _j);
+
+            const T* const in_channel = in_j + c;
+            T* const out_channel = out_j + c*out_channel_stride;
+            *(out_channel) = *(in_channel);
+          }
+        }
+      }
+#endif  // __arm_any__
+      for (; j_remaining; j++, j_remaining--)
+      {
+        const T* const in_j = in_i + j*in_col_stride;
+        T* const out_j = out_i + j;
+
+        // For every channel
+        for (int c = 0; c < n_channels; c++)
+        {
+          const T* const in_channel = in_j + c;
+          T* const out_channel = out_j + c*out_channel_stride;
+          *(out_channel) = *(in_channel);
+        }
+      }
+    }
+  }
+}
+
+template <>
+inline void nhwc_to_nchw(
+  const uint32_t* const in,  // Input data in NHWC form
+  uint32_t* const out,       // Output data in NCHW form
+  const int n_batches,
+  const int n_rows,
+  const int n_cols,
+  const int n_channels,
+  int in_batch_stride,
+  int in_row_stride,
+  int in_col_stride,
+  int out_batch_stride,
+  int out_channel_stride,
+  int out_row_stride
+)
+{
+  // Redirect to generic 32-bit implementation
+  nhwc_to_nchw(
+    reinterpret_cast<const int32_t*>(in),
+    reinterpret_cast<int32_t*>(out),
+    n_batches, n_rows, n_cols, n_channels,
+    in_batch_stride, in_row_stride, in_col_stride,
+    out_batch_stride, out_channel_stride, out_row_stride
+  );
+}
+
+template <>
+inline void nhwc_to_nchw(
+  const float* const in,  // Input data in NHWC form
+  float* const out,       // Output data in NCHW form
+  const int n_batches,
+  const int n_rows,
+  const int n_cols,
+  const int n_channels,
+  int in_batch_stride,
+  int in_row_stride,
+  int in_col_stride,
+  int out_batch_stride,
+  int out_channel_stride,
+  int out_row_stride
+)
+{
+  // Redirect to generic 32-bit implementation
+  nhwc_to_nchw(
+    reinterpret_cast<const int32_t*>(in),
+    reinterpret_cast<int32_t*>(out),
+    n_batches, n_rows, n_cols, n_channels,
+    in_batch_stride, in_row_stride, in_col_stride,
+    out_batch_stride, out_channel_stride, out_row_stride
+  );
+}
+
+/*****************************************************************************/
+/* Generic implementation : NHWC -> NCHW
+ */
+template <typename T>
+inline void nhwc_to_nchw(
+  const T* const in,  // Input data in NHWC form
+  T* const out,       // Output data in NCHW form
+  const int n_batches,
+  const int n_rows,
+  const int n_cols,
+  const int n_channels,
+  int in_batch_stride,
+  int in_row_stride,
+  int in_col_stride,
+  int out_batch_stride,
+  int out_channel_stride,
+  int out_row_stride
+)
+{
+  // Fill in stride values
+  in_col_stride = (in_col_stride) ? in_col_stride : n_channels;
+  in_row_stride = (in_row_stride) ? in_row_stride : n_cols * in_col_stride;
+  in_batch_stride = (in_batch_stride) ? in_batch_stride
+                                      : n_rows * in_row_stride;
+
+  out_row_stride = (out_row_stride) ? out_row_stride : n_cols;
+  out_channel_stride = (out_channel_stride) ? out_channel_stride
+                                            : n_rows * out_row_stride;
+  out_batch_stride = (out_batch_stride) ? out_batch_stride
+                                        : n_channels * out_channel_stride;
+
+  // Perform the re-ordering
+  // For every batch
+  for (int n = 0; n < n_batches; n++)
+  {
+    const T* const in_batch = in + n*in_batch_stride;
+    T* const out_batch = out + n*out_batch_stride;
+
+    // For every row
+    for (int i = 0; i < n_rows; i++)
+    {
+      const T* const in_i = in_batch + i*in_row_stride;
+      T* const out_i = out_batch + i*out_row_stride;
+
+      // For every column
+      for (int j = 0; j < n_cols; j++)
+      {
+        const T* const in_j = in_i + j*in_col_stride;
+        T* const out_j = out_i + j;
+
+        // For every channel
+        for (int c = 0; c < n_channels; c++)
+        {
+          const T* const in_channel = in_j + c;
+          T* const out_channel = out_j + c*out_channel_stride;
+          *(out_channel) = *(in_channel);
+        }
+      }
+    }
+  }
+}
+
+/*****************************************************************************/
+/* Generic weight re-order implementation.
+ */
+template <typename T>
+inline void ofm_ifm_h_w_to_h_w_ifm_ofm(
+  const T* const in,  // Input in [Output x Input x Height x Width] form
+  T* const out,       // Output in [Height x Width x Input x Output] form
+  const int n_output_feature_maps,
+  const int n_input_feature_maps,
+  const int n_rows,
+  const int n_cols,
+  int in_output_feature_map_stride,
+  int in_input_feature_map_stride,
+  int in_row_stride,
+  int out_row_stride,
+  int out_col_stride,
+  int out_input_feature_map_stride
+)
+{
+  // Fill in stride values
+  in_row_stride = (in_row_stride)
+    ? in_row_stride
+    : n_cols;
+  in_input_feature_map_stride = (in_input_feature_map_stride)
+    ? in_input_feature_map_stride
+    : n_rows * in_row_stride;
+  in_output_feature_map_stride = (in_output_feature_map_stride)
+    ? in_output_feature_map_stride
+    : n_input_feature_maps * in_input_feature_map_stride;
+
+  out_input_feature_map_stride = (out_input_feature_map_stride)
+    ? out_input_feature_map_stride
+    : n_output_feature_maps;
+  out_col_stride = (out_col_stride)
+    ? out_col_stride
+    : n_input_feature_maps * out_input_feature_map_stride;
+  out_row_stride = (out_row_stride)
+    ? out_row_stride
+    : n_cols * out_col_stride;
+
+  // Perform the re-ordering
+  for (int i = 0; i < n_rows; i++)
+  {
+    const T* const in_row = in + i * in_row_stride;
+    T* out_row = out + i * out_row_stride;
+
+    for (int j = 0; j < n_cols; j++)
+    {
+      const T* const in_col = in_row + j;
+      T* const out_col = out_row + j * out_col_stride;
+
+      for (int ifm = 0; ifm < n_input_feature_maps; ifm++)
+      {
+        const T* const in_ifm = in_col + ifm * in_input_feature_map_stride;
+        T* const out_ifm = out_col + ifm * out_input_feature_map_stride;
+
+        for (int ofm = 0; ofm < n_output_feature_maps; ofm++)
+        {
+          const T* const in_ofm = in_ifm + ofm * in_output_feature_map_stride;
+          T* const out_ofm = out_ifm + ofm;
+          *(out_ofm) = *(in_ofm);
+        }
+      }
+    }
+  }
+}
+
+/*****************************************************************************/
+/* Generic weight re-order implementation.
+ */
+template <typename T>
+inline void h_w_ifm_ofm_to_ofm_ifm_h_w(
+  const T* const in,  // Input in [Height x Width x Input x Output] form
+  T* const out,       // Output in [Output x Input x Height x Width] form
+  const int n_rows,
+  const int n_cols,
+  const int n_input_feature_maps,
+  const int n_output_feature_maps,
+  int in_row_stride,
+  int in_col_stride,
+  int in_input_feature_map_stride,
+  int out_output_feature_map_stride,
+  int out_input_feature_map_stride,
+  int out_row_stride
+)
+{
+  // Fill in the stride values
+  in_input_feature_map_stride = (in_input_feature_map_stride)
+    ? in_input_feature_map_stride
+    : n_output_feature_maps;
+  in_col_stride = (in_col_stride)
+    ? in_col_stride
+    : n_input_feature_maps * in_input_feature_map_stride;
+  in_row_stride = (in_row_stride)
+    ? in_row_stride
+    : n_cols * in_col_stride;
+
+  out_row_stride = (out_row_stride)
+    ? out_row_stride
+    : n_cols;
+  out_input_feature_map_stride = (out_input_feature_map_stride)
+    ? out_input_feature_map_stride
+    : n_rows * out_row_stride;
+  out_output_feature_map_stride = (out_output_feature_map_stride)
+    ? out_output_feature_map_stride
+    : n_input_feature_maps * out_input_feature_map_stride;
+
+  // Perform the re-ordering
+  for (int i = 0; i < n_rows; i++)
+  {
+    const T* const in_row = in + i * in_row_stride;
+    T* const out_row = out + i * out_row_stride;
+
+    for (int j = 0; j < n_cols; j++)
+    {
+      const T* const in_col = in_row + j * in_col_stride;
+      T* const out_col = out_row + j;
+
+      for (int ifm = 0; ifm < n_input_feature_maps; ifm++)
+      {
+        const T* const in_ifm = in_col + ifm * in_input_feature_map_stride;
+        T* const out_ifm = out_col + ifm * out_input_feature_map_stride;
+
+        for (int ofm = 0; ofm < n_output_feature_maps; ofm++)
+        {
+          const T* const in_ofm = in_ifm + ofm;
+          T* const out_ofm = out_ifm + ofm * out_output_feature_map_stride;
+          *(out_ofm) = *(in_ofm);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reorder
diff --git a/src/core/NEON/kernels/convolution/common/tensor.hpp b/src/core/NEON/kernels/convolution/common/tensor.hpp
new file mode 100644
index 0000000000..7738cdb349
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/common/tensor.hpp
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2017-2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+#include <cstdlib>
+#include <random>
+
+#include "alloc.hpp"
+
+enum TensorOrder
+{
+  NHWC,  ///< [Batch x Height x Width x Channels]
+  NCHW,  ///< [Batch x Channels x Height x Width]
+};
+
+struct Tensor4DShape
+{
+  int n_batches, n_rows, n_cols, n_channels;
+  TensorOrder ordering;
+
+  // Create a new tensor with the default (NHWC) ordering
+  inline Tensor4DShape(
+    const int n_batches,
+    const int n_rows,
+    const int n_cols,
+    const int n_channels,
+    const TensorOrder ordering=NHWC
+  ) : n_batches(n_batches),
+      n_rows(n_rows),
+      n_cols(n_cols),
+      n_channels(n_channels),
+      ordering(ordering)
+  {
+  }
+
+  inline int index(const int n, const int i, const int j, const int c) const
+  {
+    if (this->ordering == NHWC)
+    {
+      return ((n*this->n_rows + i)*this->n_cols + j)*this->n_channels + c;
+    }
+    else  // NCHW
+    {
+      return ((n*this->n_channels + c)*this->n_rows + i)*this->n_cols + j;
+    }
+  }
+
+  inline int size() const
+  {
+    return n_batches * n_rows * n_cols * n_channels;
+  }
+
+  inline bool TestEq(const Tensor4DShape& other) const
+  {
+    return (n_batches == other.n_batches &&
+            n_rows == other.n_rows &&
+            n_cols == other.n_cols &&
+            n_channels == other.n_channels);
+  }
+};
+
+
+enum WeightOrder
+{
+  HWIO,  ///< [Height x Width x Input channels x Output channels]
+  OIHW,  ///< [Output channels x Input channels x Height x Width]
+};
+
+struct KernelShape
+{
+  int n_output_channels, n_rows, n_cols, n_input_channels;
+  WeightOrder ordering;
+
+  inline KernelShape(
+    const int n_output_channels,
+    const int n_rows,
+    const int n_cols,
+    const int n_input_channels,
+    const WeightOrder ordering=HWIO
+  ) : n_output_channels(n_output_channels),
+      n_rows(n_rows),
+      n_cols(n_cols),
+      n_input_channels(n_input_channels),
+      ordering(ordering)
+  {
+  }
+
+  inline int index(int oc, int i, int j, int ic) const
+  {
+    if (this->ordering == HWIO)
+    {
+      return ((i*this->n_cols + j)*this->n_input_channels + ic)*this->n_output_channels + oc;
+    }
+    else  // OIHW
+    {
+      return ((oc*this->n_input_channels + ic)*this->n_rows + i)*this->n_cols + j;
+    }
+  }
+
+  inline int size(void) const
+  {
+    return n_output_channels * n_rows * n_cols * n_input_channels;
+  }
+};
+
+
+template <typename ShapeT, typename T>
+class Tensor4D final
+{
+  public:
+    Tensor4D(ShapeT shape) :
+      shape(shape),
+      _data(reinterpret_cast<T*>(ALLOCATE(size_bytes())))
+    {
+        Clear();
+    }
+
+    Tensor4D(const Tensor4D<ShapeT, T>&) = delete;
+    Tensor4D operator=(const Tensor4D<ShapeT, T>&) = delete;
+
+    ~Tensor4D() {
+      free(_data);
+    }
+
+    inline T* ptr() const {
+      return _data;
+    }
+
+    inline size_t size_bytes() const {
+      return shape.size() * sizeof(T);
+    }
+
+    /* Extract an element of the tensor.
+     *
+     * If the shape is a Tensor4DShape then the index is given as batch, row,
+     * column and channel.  If the shape is a KernelShape then the index is
+     * given as output channel, row, column and input channel.
+     */
+    inline T& element(const int a, const int b, const int c, const int d) const
+    {
+      return _data[shape.index(a, b, c, d)];
+    }
+
+    inline void Clear() {
+      Fill(static_cast<T>(0));
+    }
+
+    inline void Fill(T val) {
+      for (int i = 0; i < shape.size(); i++)
+        _data[i] = val;
+    }
+
+    const ShapeT shape;
+
+  private:
+    T* const _data;
+};
diff --git a/src/core/NEON/kernels/convolution/common/tensor_utils.hpp b/src/core/NEON/kernels/convolution/common/tensor_utils.hpp
new file mode 100644
index 0000000000..82619f4799
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/common/tensor_utils.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2017 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+#include "tensor.hpp"
+
+// Methods to print tensors and weights
+void PrintTensor(const Tensor4D<Tensor4DShape, float>& tensor);
+void PrintWeights(const Tensor4D<KernelShape, float>& weights);
+
+// Test the equivalence of two tensors
+// Counts the instances that |a - b|/|a| > max_err
+bool CmpTensors(
+  const Tensor4D<Tensor4DShape, float>& a,
+  const Tensor4D<Tensor4DShape, float>& b,
+  const float max_err=0.0f
+);
+
+// Fill the tensor with a test pattern
+void TestPattern(Tensor4D<Tensor4DShape, float>& tensor);
+void TestPattern(Tensor4D<KernelShape, float>& weights);
+
+// Fill the tensor with random values
+void Randomise(Tensor4D<Tensor4DShape, float>& tensor, const int seed=0);
+void Randomise(Tensor4D<KernelShape, float>& weights, const int seed=0);
diff --git a/src/core/NEON/kernels/convolution/common/utils.hpp b/src/core/NEON/kernels/convolution/common/utils.hpp
new file mode 100644
index 0000000000..b7a9517c65
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/common/utils.hpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2017-2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <limits>
+
+void PrintMatrix(const float *const m, const int M, const int N, const int row_stride);
+
+constexpr inline int iceildiv(const int a, const int b)
+{
+    return (a + b - 1) / b;
+}
+
+template <typename T>
+inline T roundup(const T a, const T b)
+{
+    return b * iceildiv(a, b);
+}
+
+template<typename T>
+struct TypeBounds
+{
+    static constexpr T lower() noexcept { return std::numeric_limits<T>::has_infinity
+                                                 ? -std::numeric_limits<T>::infinity()
+                                                 : std::numeric_limits<T>::lowest(); };
+    static constexpr T upper() noexcept { return std::numeric_limits<T>::has_infinity
+                                                 ? std::numeric_limits<T>::infinity()
+                                                 : std::numeric_limits<T>::max(); };
+};
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template<>
+struct TypeBounds<__fp16>
+{
+    static constexpr __fp16 lower() noexcept { return -std::numeric_limits<float>::infinity(); };
+    static constexpr __fp16 upper() noexcept { return std::numeric_limits<float>::infinity(); }
+};
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise.hpp
new file mode 100644
index 0000000000..70d6689731
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise.hpp
@@ -0,0 +1,551 @@
+/*
+ * Copyright (c) 2018-2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <arm_neon.h>
+#include "activation.hpp"
+#include "padding.hpp"
+
+namespace depthwise
+{
+
+namespace nck = neon_convolution_kernels;
+
+class IDepthwiseConvolution
+{
+  public:
+    virtual ~IDepthwiseConvolution() = default;
+
+    virtual int output_size(
+      int dim_size,
+      unsigned int padding_before,
+      unsigned int padding_after
+    ) const = 0;
+
+    /* Set input tensor and stride. */
+    virtual void set_input(const void *inptr) = 0;
+    virtual void set_input(const void *inptr, int column_stride) = 0;
+    virtual void set_input(const void *inptr, int row_stride, int column_stride) = 0;
+    virtual void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) = 0;
+
+    /* Set output tensor and stride. */
+    virtual void set_output(void *outptr) = 0;
+    virtual void set_output(void *outptr, int column_stride) = 0;
+    virtual void set_output(void *outptr, int row_stride, int column_stride) = 0;
+    virtual void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) = 0;
+
+    /* Weights and biases are re-ordered to improve memory access patterns. Use
+     * these methods to determine the size of the re-pack buffer and to set the
+     * address (and implicitly reorder the weights and biases into) the buffer.
+     */
+    virtual size_t get_packed_params_size(void) const = 0;
+    virtual void set_packed_params_buffer(void *) = 0;
+
+    virtual void pack_params(const void *weights, const void *biases=nullptr) const = 0;
+    virtual void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const = 0;
+    virtual void pack_params(
+      void *buffer,
+      const void* weights,
+      unsigned int weight_row_stride,
+      unsigned int weight_col_stride,
+      const void *biases=nullptr
+    ) const = 0;
+
+    /* Working space is used to pad tensors on the fly. Before running any
+     * inference check the amount of space required, allocate and provide a
+     * pointer to the convolution engine.
+     */
+    virtual size_t get_working_space_size(unsigned int nthreads=1) const = 0;
+    virtual void set_working_space(void *) = 0;
+
+    virtual unsigned int get_window(void) const = 0;
+    virtual void run(
+      unsigned int start,
+      unsigned int stop,
+      unsigned int threadid=0
+    ) = 0;
+};
+
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols,
+  typename TIn, typename TBias, typename TOut,
+  typename Derived
+>
+class DepthwiseConvolutionBase : public IDepthwiseConvolution
+{
+  public:
+    // Information about the specific convolution instance
+    using InputType = TIn;
+    using BiasType = TBias;
+    using OutputType = TOut;
+    static constexpr int output_tile_rows = OutputTileRows;
+    static constexpr int output_tile_cols = OutputTileCols;
+    static constexpr int kernel_rows = KernelRows;
+    static constexpr int kernel_cols = KernelCols;
+    static constexpr int stride_rows = StrideRows;
+    static constexpr int stride_cols = StrideCols;
+    static constexpr int inner_tile_rows = stride_rows * (output_tile_rows - 1) + kernel_rows;
+    static constexpr int inner_tile_cols = stride_cols * (output_tile_cols - 1) + kernel_cols;
+
+    /** Create a new depthwise convolution engine.
+     *
+     * @param[in] n_batches Number of batches tensors.
+     * @param[in] n_input_rows Number of rows in input tensor.
+     * @param[in] n_input_cols Number of columns in input tensor.
+     * @param[in] n_channels Number of channels in input and output tensors.
+     */
+    DepthwiseConvolutionBase(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      nck::ActivationFunction activation,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+    /** Create a new depthwise convolution engine.
+     *
+     * @param[in] n_batches Number of batches tensors.
+     * @param[in] n_input_rows Number of rows in input tensor.
+     * @param[in] n_input_cols Number of columns in input tensor.
+     * @param[in] n_channels Number of channels in input and output tensors.
+     */
+    DepthwiseConvolutionBase(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      int n_output_rows, int n_output_cols,
+      nck::ActivationFunction activation,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+    // Cannot copy or move a DepthwiseConvolution.
+    DepthwiseConvolutionBase(DepthwiseConvolutionBase&) = delete;
+    DepthwiseConvolutionBase operator=(DepthwiseConvolutionBase&) = delete;
+
+    /* Set input tensor and stride. */
+    void set_input(const void *inptr) override;
+    void set_input(const void *inptr, int column_stride) override;
+    void set_input(const void *inptr, int row_stride, int column_stride) override;
+    void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override;
+
+    /* Set output tensor and stride. */
+    void set_output(void *outptr) override;
+    void set_output(void *outptr, int column_stride) override;
+    void set_output(void *outptr, int row_stride, int column_stride) override;
+    void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override;
+
+    /** Get the number of output rows/columns.
+     *
+     * @param[in] dim_size Number of elements in the dimension (rows/columns)
+     * @param[in] same_padding True if the padding is SAME, otherwise false.
+     */
+    static int get_output_size(
+      int dim_size, unsigned int padding_before, unsigned int padding_after
+    );
+
+    int output_size(
+      int dim_size, unsigned int padding_before, unsigned int padding_after
+    ) const override;
+
+    /* Determine how much memory is required to store the packed weights and
+     * biases.
+     */
+    size_t get_packed_params_size(void) const override;
+
+    /* Set the buffer for the packed weights and biases, and perform the
+     * packing.
+     */
+    void set_packed_params_buffer(void *buffer) override;
+
+    void pack_params(const void *weights, const void *biases=nullptr) const override;
+
+    void pack_params(
+      void *buffer,
+      const void *weights,
+      const void *biases=nullptr
+    ) const override;
+
+    void pack_params(
+      void *buffer,
+      const void *weights,
+      unsigned int weight_row_stride,
+      unsigned int weight_col_stride,
+      const void *biases=nullptr
+    ) const override;
+
+    /** Query the amount of working space required.
+     * @param[in] The largest number of threads which will be used to execute
+     *            the kernel.
+     */
+    size_t get_working_space_size(unsigned int n_threads=1) const override;
+
+    /** Set the working space buffer.
+     */
+    void set_working_space(void *buffer) override;
+
+    /** Get the window of work to be performed by an instance of the operator.
+     */
+    unsigned int get_window(void) const override;
+
+    /** Perform a portion of the work associated with the operator.
+     *
+     * Will perform the window of work described by $[start, stop)$.
+     *
+     * @param[in] start Start of the window of work to perform.
+     * @param[in] stop End of the work to perform.
+     * @param[in] ID of the thread performing the work.
+     */
+    void run(
+      unsigned int start,
+      unsigned int stop,
+      unsigned int threadid=0
+    ) override;
+
+  protected:
+    /** Get the value to use to pad the tensor.
+     */
+    TIn _input_padding_value(void) const;
+
+    /** Implementation of the parameter packing.
+     */
+    void _pack_params(
+      void *buffer,
+      const void *weights,
+      unsigned int weight_row_stride,
+      unsigned int weight_col_stride,
+      const void *biases=nullptr
+    ) const;
+
+    /** Process a tile-row of the tensors.
+     */
+    void process_tile_row(
+      unsigned int threadid,
+      int n_channels,
+      const void* packed_params,
+      const InputType* inptr,
+      OutputType* outptr,
+      int row_pad_in_top,
+      int row_pad_in_left,
+      int row_pad_in_bottom,
+      int row_pad_out_bottom,
+      int n_tiles,
+      int n_input_cols,
+      int n_output_cols
+    );
+
+    /** Process a single tile of the tensor.
+     *
+     * This method will apply input/output padding (if required) and call the
+     * depthwise tile implementation.
+     */
+    void process_tile(
+      unsigned int threadid,
+      int n_channels,
+      const void* packed_params,
+      const InputType* inptr,
+      OutputType* outptr,
+      int pad_in_top,
+      int pad_in_left,
+      int pad_in_bottom,
+      int pad_in_right,
+      int pad_out_bottom,
+      int pad_out_right
+    );
+
+    /** Perform depthwise convolution on a single tile.
+     */
+    template <nck::ActivationFunction Activation>
+    void execute_tile(
+      int n_channels,
+      const void* packed_params,
+      const InputType* inptr,
+      unsigned int in_row_stride,
+      unsigned int in_col_stride,
+      OutputType* outptr,
+      unsigned int out_row_stride,
+      unsigned int out_col_stride
+    );
+
+    template <nck::ActivationFunction Activation>
+    void execute_tile(
+      int n_channels,
+      const void* packed_params,
+      const InputType* inptrs[inner_tile_rows][inner_tile_cols],
+      OutputType* outptrs[output_tile_rows][output_tile_cols]
+    );
+
+    int n_channels(void) const;
+
+  private:
+    // Member variables of instances of a convolution engine.
+    const InputType* _input;
+    OutputType* _output;
+    void* _packed_parameters;
+    void* _working_space;  // Per-thread working space
+    const int _n_batches, _n_input_rows, _n_input_cols, _n_channels,
+              _n_output_rows, _n_output_cols, _n_tile_rows, _n_tile_cols;
+    const unsigned int _padding_top, _padding_left, _padding_bottom, _padding_right;
+    const nck::ActivationFunction _activation;
+
+    // Stride information for a convolution instance
+    int _input_col_stride, _input_row_stride, _input_batch_stride;
+    int _output_col_stride, _output_row_stride, _output_batch_stride;
+
+    // Methods for getting access to working space
+    size_t _get_input_working_space_size(void) const;
+    size_t _get_output_working_space_size(void) const;
+
+    void *_get_input_working_space(unsigned int threadid) const;
+    void *_get_output_working_space(unsigned int threadid) const;
+};
+
+
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols,
+  typename TIn, typename TBias, typename TOut
+>
+class DepthwiseConvolution : public DepthwiseConvolutionBase<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols,
+  StrideRows, StrideCols,
+  TIn, TBias, TOut,
+  DepthwiseConvolution<
+    OutputTileRows, OutputTileCols,
+    KernelRows, KernelCols,
+    StrideRows, StrideCols,
+    TIn, TBias, TOut
+  >
+>
+{
+  using Base = DepthwiseConvolutionBase<
+    OutputTileRows, OutputTileCols,
+    KernelRows, KernelCols,
+    StrideRows, StrideCols,
+    TIn, TBias, TOut,
+    DepthwiseConvolution<
+      OutputTileRows, OutputTileCols,
+      KernelRows, KernelCols,
+      StrideRows, StrideCols,
+      TIn, TBias, TOut
+  > >;
+  friend Base;
+  using InputType = typename Base::InputType;
+  using OutputType = typename Base::OutputType;
+
+  public:
+    using Base::DepthwiseConvolutionBase;
+
+  protected:
+    template <nck::ActivationFunction Activation>
+    void execute_tile(
+      int n_channels,
+      const void* packed_params,
+      const TIn* inptr,
+      unsigned int in_row_stride,
+      unsigned int in_col_stride,
+      TOut* outptr,
+      unsigned int out_row_stride,
+      unsigned int out_col_stride
+    );
+
+    template <nck::ActivationFunction Activation>
+    void execute_tile(
+      int n_channels,
+      const void* packed_params,
+      const InputType* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+      OutputType* outptrs[Base::output_tile_rows][Base::output_tile_cols]
+    );
+};
+
+
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
+>
+class DepthwiseConvolution<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols,
+  StrideRows, StrideCols,
+  float, float, float
+> : public DepthwiseConvolutionBase<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols,
+  StrideRows, StrideCols,
+  float, float, float,
+  DepthwiseConvolution<
+    OutputTileRows, OutputTileCols,
+    KernelRows, KernelCols,
+    StrideRows, StrideCols,
+    float, float, float
+  >
+>
+{
+  using Base = DepthwiseConvolutionBase<
+    OutputTileRows, OutputTileCols,
+    KernelRows, KernelCols,
+    StrideRows, StrideCols,
+    float, float, float,
+    DepthwiseConvolution<
+      OutputTileRows, OutputTileCols,
+      KernelRows, KernelCols,
+      StrideRows, StrideCols,
+      float, float, float
+  > >;
+  friend Base;
+  using InputType = typename Base::InputType;
+  using OutputType = typename Base::OutputType;
+
+  public:
+    DepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      nck::ActivationFunction activation,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+    DepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      int n_output_rows, int n_output_cols,
+      nck::ActivationFunction activation,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+  protected:
+    template <nck::ActivationFunction Activation>
+    void execute_tile(
+      int n_channels,
+      const void* packed_params,
+      const float* inptr,
+      unsigned int in_row_stride,
+      unsigned int in_col_stride,
+      float* outptr,
+      unsigned int out_row_stride,
+      unsigned int out_col_stride
+    );
+
+    template <nck::ActivationFunction Activation>
+    void execute_tile(
+      int n_channels,
+      const void* packed_params,
+      const float* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+      float* outptrs[Base::output_tile_rows][Base::output_tile_cols]
+    );
+};
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
+>
+class DepthwiseConvolution<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols,
+  StrideRows, StrideCols,
+  float16_t, float16_t, float16_t
+> : public DepthwiseConvolutionBase<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols,
+  StrideRows, StrideCols,
+  float16_t, float16_t, float16_t,
+  DepthwiseConvolution<
+    OutputTileRows, OutputTileCols,
+    KernelRows, KernelCols,
+    StrideRows, StrideCols,
+    float16_t, float16_t, float16_t
+  >
+>
+{
+  using Base = DepthwiseConvolutionBase<
+    OutputTileRows, OutputTileCols,
+    KernelRows, KernelCols,
+    StrideRows, StrideCols,
+    float16_t, float16_t, float16_t,
+    DepthwiseConvolution<
+      OutputTileRows, OutputTileCols,
+      KernelRows, KernelCols,
+      StrideRows, StrideCols,
+      float16_t, float16_t, float16_t
+  > >;
+  friend Base;
+  using InputType = typename Base::InputType;
+  using OutputType = typename Base::OutputType;
+
+  public:
+    DepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      nck::ActivationFunction activation,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+    DepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      int n_output_rows, int n_output_cols,
+      nck::ActivationFunction activation,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+  protected:
+    template <nck::ActivationFunction Activation>
+    void execute_tile(
+      int n_channels,
+      const void* packed_params,
+      const float16_t* inptr,
+      unsigned int in_row_stride,
+      unsigned int in_col_stride,
+      float16_t* outptr,
+      unsigned int out_row_stride,
+      unsigned int out_col_stride
+    );
+
+    template <nck::ActivationFunction Activation>
+    void execute_tile(
+      int n_channels,
+      const void* packed_params,
+      const float16_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+      float16_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
+    );
+};
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+}  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp
new file mode 100644
index 0000000000..1bae815613
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <deque>
+#include <functional>
+#include <memory>
+
+#include "depthwise.hpp"
+
+namespace depthwise
+{
+
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols,
+  typename TIn, typename TBias, typename TOut
+>
+class DilatedDepthwiseConvolution : public IDepthwiseConvolution
+{
+  public:
+    /** Create a new dilated depthwise convolution engine.
+     */
+    DilatedDepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      int dilation_factor,
+      nck::ActivationFunction activation,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+    /** Create a new dilated depthwise convolution engine.
+     */
+    DilatedDepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      int dilation_factor, int n_output_rows, int n_output_cols,
+      nck::ActivationFunction activation,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+    // Cannot copy or move a DilatedDepthwiseConvolution.
+    DilatedDepthwiseConvolution(DilatedDepthwiseConvolution&) = delete;
+    DilatedDepthwiseConvolution operator=(DilatedDepthwiseConvolution&) = delete;
+
+    /* Set input tensor and stride. */
+    void set_input(const void *inptr) override;
+    void set_input(const void *inptr, int column_stride) override;
+    void set_input(const void *inptr, int row_stride, int column_stride) override;
+    void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override;
+
+    /* Set output tensor and stride. */
+    void set_output(void *outptr) override;
+    void set_output(void *outptr, int column_stride) override;
+    void set_output(void *outptr, int row_stride, int column_stride) override;
+    void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override;
+
+    static int get_output_size(
+      int dim_size,
+      unsigned int padding_before,
+      unsigned int padding_after,
+      int dilation_factor
+    );
+
+    int output_size(
+      int dim_size, unsigned int padding_before, unsigned int padding_after
+    ) const override;
+
+    /* Weights and biases are re-ordered to improve memory access patterns. Use
+     * these methods to determine the size of the re-pack buffer and to set the
+     * address (and implicitly reorder the weights and biases into) the buffer.
+     */
+    size_t get_packed_params_size(void) const override;
+    void set_packed_params_buffer(void *) override;
+
+    void pack_params(const void *weights, const void *biases=nullptr) const override;
+    void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const override;
+    void pack_params(
+      void *buffer,
+      const void* weights,
+      unsigned int weight_row_stride,
+      unsigned int weight_col_stride,
+      const void *biases=nullptr
+    ) const override;
+
+    /* Working space is used to pad tensors on the fly. Before running any
+     * inference check the amount of space required, allocate and provide a
+     * pointer to the convolution engine.
+     */
+    size_t get_working_space_size(unsigned int nthreads=1) const override;
+    void set_working_space(void *) override;
+
+    unsigned int get_window(void) const override;
+    void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override;
+
+  protected:
+    /** Protected constructor which also accepts a function to construct a new
+     * subconvolution
+     */
+    DilatedDepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      int dilation_factor, int n_output_rows, int n_output_cols,
+      nck::ActivationFunction activation,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right,
+      std::function<IDepthwiseConvolution *(int, int, int, int, int, int, nck::ActivationFunction, unsigned int, unsigned int, unsigned int, unsigned int)> subconvfn
+    );
+
+    const int _dilation_factor;
+    const int _n_input_rows, _n_input_cols, _n_channels;
+    const int _padding_top, _padding_left;
+    const int _n_output_rows, _n_output_cols;
+
+    /* Dilated depthwise convolution is performed through repeated calls to
+     * non-dilated convolutions. If the dilation factor is $n$, then we perform
+     * $(n + 1)^2$ depthwise convolutions.
+     */
+    using BaseDepthwise = DepthwiseConvolution<
+      OutputTileRows, OutputTileCols,
+      KernelRows, KernelCols,
+      StrideRows, StrideCols,
+      TIn, TBias, TOut
+    >;
+    std::deque<std::deque<std::unique_ptr<IDepthwiseConvolution>>> _convs;
+};
+
+}  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
new file mode 100644
index 0000000000..4343f6ad45
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2018-2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+#include "depthwise.hpp"
+#include "qasymm8.hpp"
+#include "qsymm8.hpp"
+#pragma once
+
+using namespace neon_convolution_kernels;
+using namespace qasymm8;
+
+inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32x4_t& b)
+{
+  return vqrdmulhq_s32(a, b);
+}
+
+inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32_t& b)
+{
+  return vqrdmulhq_n_s32(a, b);
+}
+
+inline int32_t saturating_doubling_high_mul(const int32_t& a, const int32_t& b)
+{
+  return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
+}
+
+inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int32x4_t shift)
+{
+  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
+  const int32x4_t fixed = vqaddq_s32(x, fixup);
+  return vrshlq_s32(fixed, shift);
+}
+
+inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int exponent)
+{
+  const int32x4_t shift = vdupq_n_s32(-exponent);
+  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
+  const int32x4_t fixed = vqaddq_s32(x, fixup);
+  return vrshlq_s32(fixed, shift);
+}
+
+inline int32x2_t rounding_divide_by_exp2(const int32x2_t& x, const int exponent)
+{
+  const int32x2_t shift = vdup_n_s32(-exponent);
+  const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
+  const int32x2_t fixed = vqadd_s32(x, fixup);
+  return vrshl_s32(fixed, shift);
+}
+
+inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent)
+{
+  const int32x2_t xs = vdup_n_s32(x);
+  return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0);
+}
+
+namespace depthwise
+{
+
+namespace nck = neon_convolution_kernels;
+
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
+>
+class QAsymm8DepthwiseConvolution : public DepthwiseConvolutionBase<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols,
+  StrideRows, StrideCols,
+  uint8_t, int32_t, uint8_t,
+  QAsymm8DepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
+>
+{
+  using Base = DepthwiseConvolutionBase<
+    OutputTileRows, OutputTileCols,
+    KernelRows, KernelCols,
+    StrideRows, StrideCols,
+    uint8_t, int32_t, uint8_t,
+    QAsymm8DepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
+  >;
+  friend Base;
+  using InputType = typename Base::InputType;
+  using OutputType = typename Base::OutputType;
+
+  public:
+    QAsymm8DepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      nck::ActivationFunction activation,
+      const qasymm8::QAsymm8Params& weight_quantisation,
+      const qasymm8::QAsymm8Params& input_quantisation,
+      const qasymm8::QAsymm8Params& output_quantisation,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+    QAsymm8DepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      int n_output_rows, int n_output_cols,
+      nck::ActivationFunction activation,
+      const qasymm8::QAsymm8Params& weight_quantisation,
+      const qasymm8::QAsymm8Params& input_quantisation,
+      const qasymm8::QAsymm8Params& output_quantisation,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+    QAsymm8DepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      nck::ActivationFunction activation,
+      const qasymm8::QAsymm8Params& weight_quantisation,
+      const qasymm8::QAsymm8Params& input_quantisation,
+      const qasymm8::QAsymm8Params& output_quantisation,
+      const qasymm8::QAsymm8RescaleParams& rescale_parameters,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+    QAsymm8DepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      int n_output_rows, int n_output_cols,
+      nck::ActivationFunction activation,
+      const qasymm8::QAsymm8Params& weight_quantisation,
+      const qasymm8::QAsymm8Params& input_quantisation,
+      const qasymm8::QAsymm8Params& output_quantisation,
+      const qasymm8::QAsymm8RescaleParams& rescale_parameters,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+  protected:
+    uint8_t _input_padding_value(void) const;
+
+    void _pack_params(
+      void *buffer,
+      const void *weights,
+      unsigned int weight_row_stride,
+      unsigned int weight_col_stride,
+      const void *biases=nullptr
+    ) const;
+
+    template <nck::ActivationFunction Activation>
+    void execute_tile(
+      int n_channels,
+      const void* packed_params,
+      const uint8_t* inptr,
+      unsigned int in_row_stride,
+      unsigned int in_col_stride,
+      uint8_t* outptr,
+      unsigned int out_row_stride,
+      unsigned int out_col_stride
+    );
+
+    template <nck::ActivationFunction Activation>
+    void execute_tile(
+      int n_channels,
+      const void* packed_params,
+      const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+      uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
+    );
+
+  private:
+    // Quantization parameters
+    const qasymm8::QAsymm8Params _weights_quant, _inputs_quant, _output_quant;
+    const qasymm8::QAsymm8RescaleParams rescale_parameters;
+};
+
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
+>
+class QSymm8HybridPerChannelDepthwiseConvolution : public DepthwiseConvolutionBase<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols,
+  StrideRows, StrideCols,
+  uint8_t, int32_t, uint8_t,
+  QSymm8HybridPerChannelDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
+>
+{
+  using Base = DepthwiseConvolutionBase<
+    OutputTileRows, OutputTileCols,
+    KernelRows, KernelCols,
+    StrideRows, StrideCols,
+    uint8_t, int32_t, uint8_t,
+    QSymm8HybridPerChannelDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
+  >;
+  friend Base;
+  using InputType = typename Base::InputType;
+  using OutputType = typename Base::OutputType;
+
+  public:
+  QSymm8HybridPerChannelDepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      nck::ActivationFunction activation,
+      const qsymm8::QSymm8PerChannelParams& weight_quantisation,
+      const qasymm8::QAsymm8Params& input_quantisation,
+      const qasymm8::QAsymm8Params& output_quantisation,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+  QSymm8HybridPerChannelDepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      nck::ActivationFunction activation,
+      const qsymm8::QSymm8PerChannelParams& weight_quantisation,
+      const qasymm8::QAsymm8Params& input_quantisation,
+      const qasymm8::QAsymm8Params& output_quantisation,
+      const qsymm8::QSymm8PerChannelRescaleParams& rescale_parameters,
+      unsigned int padding_top,
+      unsigned int padding_left,
+      unsigned int padding_bottom,
+      unsigned int padding_right
+    );
+
+  size_t get_packed_params_size(void) const override
+  {
+      return this->n_channels() * (sizeof(int8_t)*KernelRows*KernelCols + 3*sizeof(int32_t));
+
+  }
+
+  protected:
+    uint8_t _input_padding_value(void) const;
+
+    void _pack_params(
+      void *buffer,
+      const void *weights,
+      unsigned int weight_row_stride,
+      unsigned int weight_col_stride,
+      const void *biases=nullptr
+    ) const;
+
+    template <nck::ActivationFunction Activation>
+    void execute_tile(
+      int n_channels,
+      const void* packed_params,
+      const uint8_t* inptr,
+      unsigned int in_row_stride,
+      unsigned int in_col_stride,
+      uint8_t* outptr,
+      unsigned int out_row_stride,
+      unsigned int out_col_stride
+    );
+
+    template <nck::ActivationFunction Activation>
+    void execute_tile(
+      int n_channels,
+      const void* packed_params,
+      const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+      uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
+    );
+
+  private:
+    // Quantization parameters
+    const qsymm8::QSymm8PerChannelParams _weights_quant;
+    const qasymm8::QAsymm8Params _input_quant, _output_quant;
+    const qsymm8::QSymm8PerChannelRescaleParams _rescale_parameters;
+};
+
+}  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp
new file mode 100644
index 0000000000..a11b0981c9
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+#include "depthwise_dilated.hpp"
+#include "depthwise_quantized.hpp"
+
+namespace depthwise {
+
+template <unsigned int OutputTileRows, unsigned int OutputTileCols,
+          unsigned int KernelRows, unsigned int KernelCols,
+          unsigned int StrideRows, unsigned int StrideCols>
+class QAsymm8DilatedDepthwiseConvolution
+    : public DilatedDepthwiseConvolution<
+          OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
+          StrideCols, uint8_t, int32_t, uint8_t> {
+public:
+  /** Create a new dilated depthwise convolution engine.
+   */
+  QAsymm8DilatedDepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      int dilation_factor, nck::ActivationFunction activation,
+      const qasymm8::QAsymm8Params &weight_quantisation,
+      const qasymm8::QAsymm8Params &input_quantisation,
+      const qasymm8::QAsymm8Params &output_quantisation,
+      unsigned int padding_top, unsigned int padding_left,
+      unsigned int padding_bottom, unsigned int padding_right);
+
+  /** Create a new dilated depthwise convolution engine.
+   */
+  QAsymm8DilatedDepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      int dilation_factor, int n_output_rows, int n_output_cols,
+      nck::ActivationFunction activation,
+      const qasymm8::QAsymm8Params &weight_quantisation,
+      const qasymm8::QAsymm8Params &input_quantisation,
+      const qasymm8::QAsymm8Params &output_quantisation,
+      unsigned int padding_top, unsigned int padding_left,
+      unsigned int padding_bottom, unsigned int padding_right);
+
+  /** Create a new dilated depthwise convolution engine.
+   */
+  QAsymm8DilatedDepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      int dilation_factor, nck::ActivationFunction activation,
+      const qasymm8::QAsymm8Params &weight_quantisation,
+      const qasymm8::QAsymm8Params &input_quantisation,
+      const qasymm8::QAsymm8Params &output_quantisation,
+      const qasymm8::QAsymm8RescaleParams &rescale_parameters,
+      unsigned int padding_top, unsigned int padding_left,
+      unsigned int padding_bottom, unsigned int padding_right);
+
+  /** Create a new dilated depthwise convolution engine.
+   */
+  QAsymm8DilatedDepthwiseConvolution(
+      int n_batches, int n_input_rows, int n_input_cols, int n_channels,
+      int dilation_factor, int n_output_rows, int n_output_cols,
+      nck::ActivationFunction activation,
+      const qasymm8::QAsymm8Params &weight_quantisation,
+      const qasymm8::QAsymm8Params &input_quantisation,
+      const qasymm8::QAsymm8Params &output_quantisation,
+      const qasymm8::QAsymm8RescaleParams& rescale_parameters,
+      unsigned int padding_top, unsigned int padding_left,
+      unsigned int padding_bottom, unsigned int padding_right);
+};
+
+}  // namespace depthwise
diff --git a/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
index d7ee70a1cd..59f5c6c6b3 100644
--- a/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
+++ b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
@@ -25,10 +25,10 @@
 #ifndef ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H
 #define ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H
 
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/utils/misc/Requires.h"
+#include "src/core/AccessWindowStatic.h"
 #include "src/core/NEON/NEFixedPoint.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "support/Requires.h"
 
 #include <arm_neon.h>
 
diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp
index c1a1c2ecf0..414c128a27 100644
--- a/src/core/TensorInfo.cpp
+++ b/src/core/TensorInfo.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/Utils.h"
 #include "support/MemorySupport.h"
 
 using namespace arm_compute;
diff --git a/src/core/helpers/AutoConfiguration.h b/src/core/helpers/AutoConfiguration.h
new file mode 100644
index 0000000000..6880a6cb66
--- /dev/null
+++ b/src/core/helpers/AutoConfiguration.h
@@ -0,0 +1,176 @@
+/*
+* Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_HELPERS_AUTOCONFIGURATION_H
+#define SRC_CORE_HELPERS_AUTOCONFIGURATION_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+/** Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty.
+ *
+ * @param[in,out] info              Tensor info used to check and assign.
+ * @param[in]     shape             New shape.
+ * @param[in]     num_channels      New number of channels.
+ * @param[in]     data_type         New data type
+ * @param[in]     quantization_info (Optional) New quantization info
+ *
+ * @return True if the tensor info has been initialized
+ */
+inline bool auto_init_if_empty(ITensorInfo       &info,
+                               const TensorShape &shape,
+                               int num_channels, DataType data_type,
+                               QuantizationInfo quantization_info = QuantizationInfo())
+{
+    if(info.tensor_shape().total_size() == 0)
+    {
+        info.set_data_type(data_type);
+        info.set_num_channels(num_channels);
+        info.set_tensor_shape(shape);
+        info.set_quantization_info(quantization_info);
+        return true;
+    }
+
+    return false;
+}
+
+/** Auto initialize the tensor info using another tensor info.
+*
+* @param info_sink   Tensor info used to check and assign
+* @param info_source Tensor info used to assign
+*
+* @return True if the tensor info has been initialized
+*/
+inline bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_source)
+{
+    if(info_sink.tensor_shape().total_size() == 0)
+    {
+        info_sink.set_data_type(info_source.data_type());
+        info_sink.set_num_channels(info_source.num_channels());
+        info_sink.set_tensor_shape(info_source.tensor_shape());
+        info_sink.set_quantization_info(info_source.quantization_info());
+        info_sink.set_data_layout(info_source.data_layout());
+        return true;
+    }
+
+    return false;
+}
+
+/** Set the shape to the specified value if the current assignment is empty.
+ *
+ * @param[in,out] info  Tensor info used to check and assign.
+ * @param[in]     shape New shape.
+ *
+ * @return True if the shape has been changed.
+ */
+inline bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape)
+{
+    if(info.tensor_shape().total_size() == 0)
+    {
+        info.set_tensor_shape(shape);
+        return true;
+    }
+
+    return false;
+}
+
+/** Set the format, data type and number of channels to the specified value if
+ * the current data type is unknown.
+ *
+ * @param[in,out] info   Tensor info used to check and assign.
+ * @param[in]     format New format.
+ *
+ * @return True if the format has been changed.
+ */
+inline bool set_format_if_unknown(ITensorInfo &info, Format format)
+{
+    if(info.data_type() == DataType::UNKNOWN)
+    {
+        info.set_format(format);
+        return true;
+    }
+
+    return false;
+}
+
+/** Set the data type and number of channels to the specified value if
+ * the current data type is unknown.
+ *
+ * @param[in,out] info      Tensor info used to check and assign.
+ * @param[in]     data_type New data type.
+ *
+ * @return True if the data type has been changed.
+ */
+inline bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type)
+{
+    if(info.data_type() == DataType::UNKNOWN)
+    {
+        info.set_data_type(data_type);
+        return true;
+    }
+
+    return false;
+}
+
+/** Set the data layout to the specified value if
+ * the current data layout is unknown.
+ *
+ * @param[in,out] info        Tensor info used to check and assign.
+ * @param[in]     data_layout New data layout.
+ *
+ * @return True if the data type has been changed.
+ */
+inline bool set_data_layout_if_unknown(ITensorInfo &info, DataLayout data_layout)
+{
+    if(info.data_layout() == DataLayout::UNKNOWN)
+    {
+        info.set_data_layout(data_layout);
+        return true;
+    }
+
+    return false;
+}
+
+/** Set the quantization info to the specified value if
+ * the current quantization info is empty and the data type of asymmetric quantized type
+ *
+ * @param[in,out] info              Tensor info used to check and assign.
+ * @param[in]     quantization_info Quantization info
+ *
+ * @return True if the quantization info has been changed.
+ */
+inline bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo quantization_info)
+{
+    if(info.quantization_info().empty() && (is_data_type_quantized_asymmetric(info.data_type())))
+    {
+        info.set_quantization_info(quantization_info);
+        return true;
+    }
+
+    return false;
+}
+} // namespace arm_compute
+
+#endif /* SRC_CORE_HELPERS_AUTOCONFIGURATION_H */
diff --git a/src/core/helpers/NormalizationHelpers.h b/src/core/helpers/NormalizationHelpers.h
new file mode 100644
index 0000000000..d94d5e3602
--- /dev/null
+++ b/src/core/helpers/NormalizationHelpers.h
@@ -0,0 +1,47 @@
+/*
+* Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_HELPERS_NORMALIZATIONHELPERS_H
+#define SRC_CORE_HELPERS_NORMALIZATIONHELPERS_H
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+/** Calculate the normalization dimension index for a given normalization type
+ *
+ * @param[in] layout Data layout of the input and output tensor
+ * @param[in] info   Normalization info
+ *
+ * @return Normalization dimension index
+ */
+inline unsigned int get_normalization_dimension_index(DataLayout layout, const NormalizationLayerInfo &info)
+{
+    const unsigned int width_idx   = get_data_layout_dimension_index(layout, DataLayoutDimension::WIDTH);
+    const unsigned int channel_idx = get_data_layout_dimension_index(layout, DataLayoutDimension::CHANNEL);
+
+    return info.is_in_map() ? width_idx : channel_idx;
+}
+} // namespace arm_compute
+#endif /* SRC_CORE_HELPERS_NORMALIZATIONHELPERS_H */
diff --git a/src/core/helpers/ScaleHelpers.h b/src/core/helpers/ScaleHelpers.h
new file mode 100644
index 0000000000..827bbef4cd
--- /dev/null
+++ b/src/core/helpers/ScaleHelpers.h
@@ -0,0 +1,331 @@
+/*
+* Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_HELPERS_SCALEHELPERS_H
+#define SRC_CORE_HELPERS_SCALEHELPERS_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/QuantizationInfo.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace scale_helpers
+{
+/** Computes bilinear interpolation using the pointer to the top-left pixel and the pixel's distance between
+ * the real coordinates and the smallest following integer coordinates. Input must be in single channel format.
+ *
+ * @param[in] pixel_ptr Pointer to the top-left pixel value of a single channel input.
+ * @param[in] stride    Stride to access the bottom-left and bottom-right pixel values
+ * @param[in] dx        Pixel's distance between the X real coordinate and the smallest X following integer
+ * @param[in] dy        Pixel's distance between the Y real coordinate and the smallest Y following integer
+ *
+ * @note dx and dy must be in the range [0, 1.0]
+ *
+ * @return The bilinear interpolated pixel value
+ */
+template <typename T>
+inline T delta_bilinear_c1(const T *pixel_ptr, size_t stride, float dx, float dy)
+{
+    ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
+
+    const float dx1 = 1.0f - dx;
+    const float dy1 = 1.0f - dy;
+
+    const T a00 = *pixel_ptr;
+    const T a01 = *(pixel_ptr + 1);
+    const T a10 = *(pixel_ptr + stride);
+    const T a11 = *(pixel_ptr + stride + 1);
+
+    const float w1 = dx1 * dy1;
+    const float w2 = dx * dy1;
+    const float w3 = dx1 * dy;
+    const float w4 = dx * dy;
+
+    return static_cast<T>(a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4);
+}
+
+/** Computes bilinear interpolation for quantized input and output, using the pointer to the top-left pixel and the pixel's distance between
+ * the real coordinates and the smallest following integer coordinates. Input must be QASYMM8 and in single channel format.
+ *
+ * @param[in] pixel_ptr Pointer to the top-left pixel value of a single channel input.
+ * @param[in] stride    Stride to access the bottom-left and bottom-right pixel values
+ * @param[in] dx        Pixel's distance between the X real coordinate and the smallest X following integer
+ * @param[in] dy        Pixel's distance between the Y real coordinate and the smallest Y following integer
+ * @param[in] iq_info   Input QuantizationInfo
+ * @param[in] oq_info   Output QuantizationInfo
+ *
+ * @note dx and dy must be in the range [0, 1.0]
+ *
+ * @return The bilinear interpolated pixel value
+ */
+inline uint8_t delta_bilinear_c1_quantized(const uint8_t *pixel_ptr, size_t stride, float dx, float dy,
+                                           UniformQuantizationInfo iq_info, UniformQuantizationInfo oq_info)
+{
+    ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
+
+    const float dx1 = 1.0f - dx;
+    const float dy1 = 1.0f - dy;
+
+    const float a00 = dequantize_qasymm8(*pixel_ptr, iq_info);
+    const float a01 = dequantize_qasymm8(*(pixel_ptr + 1), iq_info);
+    const float a10 = dequantize_qasymm8(*(pixel_ptr + stride), iq_info);
+    const float a11 = dequantize_qasymm8(*(pixel_ptr + stride + 1), iq_info);
+
+    const float w1  = dx1 * dy1;
+    const float w2  = dx * dy1;
+    const float w3  = dx1 * dy;
+    const float w4  = dx * dy;
+    float       res = a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4;
+    return static_cast<uint8_t>(quantize_qasymm8(res, oq_info));
+}
+
+/** Computes bilinear interpolation for quantized input and output, using the pointer to the top-left pixel and the pixel's distance between
+ * the real coordinates and the smallest following integer coordinates. Input must be QASYMM8_SIGNED and in single channel format.
+ *
+ * @param[in] pixel_ptr Pointer to the top-left pixel value of a single channel input.
+ * @param[in] stride    Stride to access the bottom-left and bottom-right pixel values
+ * @param[in] dx        Pixel's distance between the X real coordinate and the smallest X following integer
+ * @param[in] dy        Pixel's distance between the Y real coordinate and the smallest Y following integer
+ * @param[in] iq_info   Input QuantizationInfo
+ * @param[in] oq_info   Output QuantizationInfo
+ *
+ * @note dx and dy must be in the range [0, 1.0]
+ *
+ * @return The bilinear interpolated pixel value
+ */
+inline int8_t delta_bilinear_c1_quantized(const int8_t *pixel_ptr, size_t stride, float dx, float dy,
+                                          UniformQuantizationInfo iq_info, UniformQuantizationInfo oq_info)
+{
+    ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
+
+    const float dx1 = 1.0f - dx;
+    const float dy1 = 1.0f - dy;
+
+    const float a00 = dequantize_qasymm8_signed(*pixel_ptr, iq_info);
+    const float a01 = dequantize_qasymm8_signed(*(pixel_ptr + 1), iq_info);
+    const float a10 = dequantize_qasymm8_signed(*(pixel_ptr + stride), iq_info);
+    const float a11 = dequantize_qasymm8_signed(*(pixel_ptr + stride + 1), iq_info);
+
+    const float w1  = dx1 * dy1;
+    const float w2  = dx * dy1;
+    const float w3  = dx1 * dy;
+    const float w4  = dx * dy;
+    float       res = a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4;
+    return static_cast<int8_t>(quantize_qasymm8_signed(res, oq_info));
+}
+
+/** Computes linear interpolation using the pointer to the top pixel and the pixel's distance between
+ * the real coordinates and the smallest following integer coordinates. Input must be in single channel format.
+ *
+ * @param[in] pixel_ptr Pointer to the top pixel value of a single channel input.
+ * @param[in] stride    Stride to access the bottom pixel value
+ * @param[in] dy        Pixel's distance between the Y real coordinate and the smallest Y following integer
+ *
+ * @note dy must be in the range [0, 1.0]
+ *
+ * @return The linear interpolated pixel value
+ */
+template <typename T>
+inline T delta_linear_c1_y(const T *pixel_ptr, size_t stride, float dy)
+{
+    ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
+
+    const float dy1 = 1.0f - dy;
+
+    const T a00 = *pixel_ptr;
+    const T a10 = *(pixel_ptr + stride);
+
+    const float w1 = dy1;
+    const float w3 = dy;
+
+    return static_cast<T>(a00 * w1 + a10 * w3);
+}
+
+/** Computes linear interpolation using the pointer to the left pixel and the pixel's distance between
+ * the real coordinates and the smallest following integer coordinates. Input must be in single channel format.
+ *
+ * @param[in] pixel_ptr Pointer to the left pixel value of a single channel input.
+ * @param[in] dx        Pixel's distance between the X real coordinate and the smallest X following integer
+ *
+ * @note dx must be in the range [0, 1.0]
+ *
+ * @return The linear interpolated pixel value
+ */
+template <typename T>
+inline T delta_linear_c1_x(const T *pixel_ptr, float dx)
+{
+    ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
+
+    const T a00 = *pixel_ptr;
+    const T a01 = *(pixel_ptr + 1);
+
+    const float dx1 = 1.0f - dx;
+
+    const float w1 = dx1;
+    const float w2 = dx;
+
+    return static_cast<T>(a00 * w1 + a01 * w2);
+}
+
+/** Return the pixel at (x,y) using bilinear interpolation.
+ *
+ * @warning Only works if the iterator was created with an IImage
+ *
+ * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel input.
+ * @param[in] stride          Stride in bytes of the image;
+ * @param[in] x               X position of the wanted pixel
+ * @param[in] y               Y position of the wanted pixel
+ *
+ * @return The pixel at (x, y) using bilinear interpolation.
+ */
+template <typename T>
+inline T pixel_bilinear_c1(const T *first_pixel_ptr, size_t stride, float x, float y)
+{
+    ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
+
+    const int32_t xi = std::floor(x);
+    const int32_t yi = std::floor(y);
+
+    const float dx = x - xi;
+    const float dy = y - yi;
+
+    return delta_bilinear_c1(first_pixel_ptr + xi + yi * stride, stride, dx, dy);
+}
+
+/** Return the pixel at (x,y) using bilinear interpolation by clamping when out of borders. The image must be single channel input
+ *
+ * @warning Only works if the iterator was created with an IImage
+ *
+ * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel image.
+ * @param[in] stride          Stride in bytes of the image
+ * @param[in] width           Width of the image
+ * @param[in] height          Height of the image
+ * @param[in] x               X position of the wanted pixel
+ * @param[in] y               Y position of the wanted pixel
+ *
+ * @return The pixel at (x, y) using bilinear interpolation.
+ */
+template <typename T>
+inline uint8_t
+pixel_bilinear_c1_clamp(const T *first_pixel_ptr, size_t stride, size_t width, size_t height, float x, float y)
+{
+    ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
+
+    x = std::max(-1.f, std::min(x, static_cast<float>(width)));
+    y = std::max(-1.f, std::min(y, static_cast<float>(height)));
+
+    const float xi = std::floor(x);
+    const float yi = std::floor(y);
+
+    const float dx = x - xi;
+    const float dy = y - yi;
+
+    if(dx == 0.0f)
+    {
+        if(dy == 0.0f)
+        {
+            return static_cast<T>(first_pixel_ptr[static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride]);
+        }
+        return delta_linear_c1_y(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride,
+                                 stride, dy);
+    }
+    if(dy == 0.0f)
+    {
+        return delta_linear_c1_x(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride,
+                                 dx);
+    }
+    return delta_bilinear_c1(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride, stride,
+                             dx, dy);
+}
+
+/** Return the pixel at (x,y) using area interpolation by clamping when out of borders. The image must be single channel U8
+ *
+ * @note The interpolation area depends on the width and height ration of the input and output images
+ * @note Currently average of the contributing pixels is calculated
+ *
+ * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel U8 image.
+ * @param[in] stride          Stride in bytes of the image
+ * @param[in] width           Width of the image
+ * @param[in] height          Height of the image
+ * @param[in] wr              Width ratio among the input image width and output image width.
+ * @param[in] hr              Height ratio among the input image height and output image height.
+ * @param[in] x               X position of the wanted pixel
+ * @param[in] y               Y position of the wanted pixel
+ *
+ * @return The pixel at (x, y) using area interpolation.
+ */
+inline uint8_t
+pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr,
+                      float hr, int x, int y)
+{
+    ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
+
+    // Calculate sampling position
+    float in_x = (x + 0.5f) * wr - 0.5f;
+    float in_y = (y + 0.5f) * hr - 0.5f;
+
+    // Get bounding box offsets
+    int x_from = std::floor(x * wr - 0.5f - in_x);
+    int y_from = std::floor(y * hr - 0.5f - in_y);
+    int x_to   = std::ceil((x + 1) * wr - 0.5f - in_x);
+    int y_to   = std::ceil((y + 1) * hr - 0.5f - in_y);
+
+    // Clamp position to borders
+    in_x = std::max(-1.f, std::min(in_x, static_cast<float>(width)));
+    in_y = std::max(-1.f, std::min(in_y, static_cast<float>(height)));
+
+    // Clamp bounding box offsets to borders
+    x_from = ((in_x + x_from) < -1) ? -1 : x_from;
+    y_from = ((in_y + y_from) < -1) ? -1 : y_from;
+    x_to   = ((in_x + x_to) > width) ? (width - in_x) : x_to;
+    y_to   = ((in_y + y_to) > height) ? (height - in_y) : y_to;
+
+    // Get pixel index
+    const int xi = std::floor(in_x);
+    const int yi = std::floor(in_y);
+
+    // Bounding box elements in each dimension
+    const int x_elements = (x_to - x_from + 1);
+    const int y_elements = (y_to - y_from + 1);
+    ARM_COMPUTE_ERROR_ON(x_elements == 0 || y_elements == 0);
+
+    // Sum pixels in area
+    int sum = 0;
+    for(int j = yi + y_from, je = yi + y_to; j <= je; ++j)
+    {
+        const uint8_t *ptr = first_pixel_ptr + j * stride + xi + x_from;
+        sum                = std::accumulate(ptr, ptr + x_elements, sum);
+    }
+
+    // Return average
+    return sum / (x_elements * y_elements);
+}
+} // namespace scale_helpers
+} // namespace arm_compute
+
+#endif /* SRC_CORE_HELPERS_SCALEHELPERS_H */
diff --git a/src/core/helpers/SoftmaxHelpers.cpp b/src/core/helpers/SoftmaxHelpers.cpp
new file mode 100644
index 0000000000..71b971af31
--- /dev/null
+++ b/src/core/helpers/SoftmaxHelpers.cpp
@@ -0,0 +1,45 @@
+/*
+* Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/helpers/SoftmaxHelpers.h"
+
+namespace arm_compute
+{
+namespace softmax_helpers
+{
+PermutationVector get_permutation_vector_from_softmax_axis(size_t axis)
+{
+    switch(axis)
+    {
+        case 1:
+            return PermutationVector(1U, 0U, 2U, 3U);
+        case 2:
+            return PermutationVector(2U, 1U, 0U, 3U);
+        case 3:
+            return PermutationVector(3U, 1U, 2U, 0U);
+        default:
+            ARM_COMPUTE_ERROR("Axis not supported");
+    }
+}
+} // namespace softmax_helpers
+} // namespace arm_compute
diff --git a/src/core/helpers/SoftmaxHelpers.h b/src/core/helpers/SoftmaxHelpers.h
new file mode 100644
index 0000000000..de5490a14d
--- /dev/null
+++ b/src/core/helpers/SoftmaxHelpers.h
@@ -0,0 +1,50 @@
+/*
+* Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_HELPERS_SOFTMAXHELPERS_H
+#define SRC_CORE_HELPERS_SOFTMAXHELPERS_H
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+namespace softmax_helpers
+{
+/** Given a softmax axis, this function returns the permutation vector required to put the axis to the front
+ *
+ * @note This function assumes a tensor rank <= 4
+ *
+ * Axis selects the dimension on which softmax is performed.
+ * E.g. For input of shape 4x5x6 and axis=1, softmax will be applied to 4x6=24 vectors of size 5.
+ * Interally softmax kernels is always performed on the first dimension (front dimension), therefore permutation is
+ * required to put the dimension specified by @p axis to the first dimension.
+ *
+ * @param[in] axis Axis on which to perform softmax. Supported: 1, 2, 3 (0 implies no permutation needed)
+ *
+ * @return the permutation vector
+ */
+PermutationVector get_permutation_vector_from_softmax_axis(size_t axis);
+} // namespace softmax_helpers
+} // namespace arm_compute
+
+#endif /* SRC_CORE_HELPERS_SOFTMAXHELPERS_H */
diff --git a/src/core/helpers/Utils.h b/src/core/helpers/Utils.h
new file mode 100644
index 0000000000..3c3b2b93f9
--- /dev/null
+++ b/src/core/helpers/Utils.h
@@ -0,0 +1,97 @@
+/*
+* Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_HELPERS_UTILS_H
+#define SRC_CORE_HELPERS_UTILS_H
+
+#include "arm_compute/core/ITensorInfo.h"
+
+namespace arm_compute
+{
+/** Create a strides object based on the provided strides and the tensor dimensions.
+ *
+ * @param[in] info          Tensor info object providing the shape of the tensor for unspecified strides.
+ * @param[in] stride_x      Stride to be used in X dimension (in bytes).
+ * @param[in] fixed_strides Strides to be used in higher dimensions starting at Y (in bytes).
+ *
+ * @return Strides object based on the specified strides. Missing strides are
+ *         calculated based on the tensor shape and the strides of lower dimensions.
+ */
+template <typename T, typename... Ts>
+inline Strides compute_strides(const ITensorInfo &info, T stride_x, Ts &&... fixed_strides)
+{
+    const TensorShape &shape = info.tensor_shape();
+
+    // Create strides object
+    Strides strides(stride_x, fixed_strides...);
+
+    for(size_t i = 1 + sizeof...(Ts); i < info.num_dimensions(); ++i)
+    {
+        strides.set(i, shape[i - 1] * strides[i - 1]);
+    }
+
+    return strides;
+}
+
+/** Create a strides object based on the tensor dimensions.
+ *
+ * @param[in] info Tensor info object used to compute the strides.
+ *
+ * @return Strides object based on element size and tensor shape.
+ */
+template <typename... Ts>
+inline Strides compute_strides(const ITensorInfo &info)
+{
+    return compute_strides(info, info.element_size());
+}
+
+/** Given an integer value, this function returns the next power of two
+ *
+ * @param[in] x Input value
+ *
+ * @return the next power of two
+ */
+inline unsigned int get_next_power_two(unsigned int x)
+{
+    // Decrement by 1
+    x--;
+
+    // Shift right by 1
+    x |= x >> 1u;
+    // Shift right by 2
+    x |= x >> 2u;
+    // Shift right by 4
+    x |= x >> 4u;
+    // Shift right by 8
+    x |= x >> 8u;
+    // Shift right by 16
+    x |= x >> 16u;
+
+    // Increment by 1
+    x++;
+
+    return x;
+}
+} // namespace arm_compute
+
+#endif /* SRC_CORE_HELPERS_UTILS_H */
diff --git a/src/core/helpers/WindowHelpers.cpp b/src/core/helpers/WindowHelpers.cpp
new file mode 100644
index 0000000000..ba10eb9775
--- /dev/null
+++ b/src/core/helpers/WindowHelpers.cpp
@@ -0,0 +1,183 @@
+/*
+* Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/helpers/WindowHelpers.h"
+
+namespace arm_compute
+{
+Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
+{
+    if(!skip_border)
+    {
+        border_size = BorderSize(0);
+    }
+
+    const Coordinates &anchor = valid_region.anchor;
+    const TensorShape &shape  = valid_region.shape;
+
+    Window window;
+
+    window.set(0, Window::Dimension(
+                   // Skip the border left of the image
+                   anchor[0] + border_size.left,
+                   // Skip the border right of the image
+                   // Make sure the window width is a multiple of the step size
+                   anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]),
+                   steps[0]));
+
+    size_t n = 1;
+
+    if(anchor.num_dimensions() > 1)
+    {
+        window.set(1, Window::Dimension(
+                       // Skip the border above the image
+                       anchor[1] + border_size.top,
+                       // Skip the border below the image
+                       anchor[1] + border_size.top + ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) - static_cast<int>(border_size.top) - static_cast<int>(border_size.bottom)), steps[1]),
+                       steps[1]));
+
+        ++n;
+    }
+
+    if(anchor.num_dimensions() > 2)
+    {
+        window.set(2, Window::Dimension(anchor[2], std::max<size_t>(1, shape[2]), steps[2]));
+
+        ++n;
+    }
+
+    for(; n < anchor.num_dimensions(); ++n)
+    {
+        window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n])));
+    }
+
+    for(; n < Coordinates::num_max_dimensions; ++n)
+    {
+        window.set(n, Window::Dimension(0, 1));
+    }
+
+    return window;
+}
+
+Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Steps &steps, BorderSize border_size)
+{
+    const Coordinates &anchor = valid_region.anchor;
+    const TensorShape &shape  = valid_region.shape;
+
+    Window window;
+
+    window.set(0, Window::Dimension(
+                   // move the anchor to the start from the border
+                   anchor[0] - border_size.left,
+                   // move the anchor to include the right end border
+                   // Make sure the window width is a multiple of the step size
+                   anchor[0] - border_size.left + ceil_to_multiple(shape[0] + border_size.left + border_size.right, steps[0]),
+                   steps[0]));
+
+    size_t n = 1;
+
+    if(anchor.num_dimensions() > 1)
+    {
+        window.set(1, Window::Dimension(
+                       // Include the border above the image
+                       anchor[1] - border_size.top,
+                       // Include the border below the image
+                       anchor[1] - border_size.top + ceil_to_multiple(shape[1] + border_size.top + border_size.bottom, steps[1]),
+                       steps[1]));
+
+        ++n;
+    }
+
+    if(anchor.num_dimensions() > 2)
+    {
+        window.set(2, Window::Dimension(0, std::max<size_t>(1, shape[n]), steps[2]));
+
+        ++n;
+    }
+
+    for(; n < anchor.num_dimensions(); ++n)
+    {
+        window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n])));
+    }
+
+    for(; n < Coordinates::num_max_dimensions; ++n)
+    {
+        window.set(n, Window::Dimension(0, 1));
+    }
+
+    return window;
+}
+
+Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
+{
+    if(skip_border)
+    {
+        border_size.top    = 0;
+        border_size.bottom = 0;
+    }
+    else
+    {
+        border_size.left  = 0;
+        border_size.right = 0;
+    }
+
+    const Coordinates &anchor = valid_region.anchor;
+    const TensorShape &shape  = valid_region.shape;
+
+    Window window;
+
+    window.set(0, Window::Dimension(
+                   // Skip the border left of the image
+                   anchor[0] + border_size.left,
+                   // Skip the border right of the image
+                   // Make sure the window width is a multiple of the step size
+                   anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]),
+                   steps[0]));
+
+    size_t n = 1;
+
+    if(anchor.num_dimensions() > 1)
+    {
+        window.set(1, Window::Dimension(
+                       // Skip the border above the image
+                       anchor[1] - border_size.top,
+                       // Skip the border below the image
+                       anchor[1] + shape[1] + border_size.bottom,
+                       1));
+
+        ++n;
+    }
+
+    for(; n < anchor.num_dimensions(); ++n)
+    {
+        window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n])));
+    }
+
+    for(; n < Coordinates::num_max_dimensions; ++n)
+    {
+        window.set(n, Window::Dimension(0, 1));
+    }
+
+    return window;
+}
+} // namespace arm_compute
diff --git a/src/core/helpers/WindowHelpers.h b/src/core/helpers/WindowHelpers.h
new file mode 100644
index 0000000000..9bc2135b6d
--- /dev/null
+++ b/src/core/helpers/WindowHelpers.h
@@ -0,0 +1,172 @@
+/*
+* Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_HELPERS_WINDOWHELPERS_H
+#define SRC_CORE_HELPERS_WINDOWHELPERS_H
+
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Steps.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+/** Update window and padding size for each of the access patterns.
+ *
+ * First the window size is reduced based on all access patterns that are not
+ * allowed to modify the padding of the underlying tensor. Then the padding of
+ * the remaining tensors is increased to match the window.
+ *
+ * @param[in] win      Window that is used by the kernel.
+ * @param[in] patterns Access patterns used to calculate the final window and padding.
+ *
+ * @return True if the window has been changed. Changes to the padding do not
+ *         influence the returned value.
+ */
+template <typename... Ts>
+bool update_window_and_padding(Window &win, Ts &&... patterns)
+{
+    bool window_changed = false;
+
+    utility::for_each([&](const IAccessWindow & w)
+    {
+        window_changed |= w.update_window_if_needed(win);
+    },
+    patterns...);
+
+    bool padding_changed = false;
+
+    utility::for_each([&](IAccessWindow & w)
+    {
+        padding_changed |= w.update_padding_if_needed(win);
+    },
+    patterns...);
+
+    return window_changed;
+}
+
+/** Intersect multiple valid regions.
+ *
+ * @param[in] regions Valid regions.
+ *
+ * @return Intersection of all regions.
+ */
+template <typename... Ts>
+ValidRegion intersect_valid_regions(const Ts &... regions)
+{
+    auto intersect = [](const ValidRegion & r1, const ValidRegion & r2) -> ValidRegion
+    {
+        ValidRegion region;
+
+        for(size_t d = 0; d < std::min(r1.anchor.num_dimensions(), r2.anchor.num_dimensions()); ++d)
+        {
+            region.anchor.set(d, std::max(r1.anchor[d], r2.anchor[d]));
+        }
+
+        for(size_t d = 0; d < std::min(r1.shape.num_dimensions(), r2.shape.num_dimensions()); ++d)
+        {
+            region.shape.set(d, std::min(r1.shape[d], r2.shape[d]));
+        }
+
+        return region;
+    };
+
+    return utility::foldl(intersect, regions...);
+}
+
+#ifndef DOXYGEN_SKIP_THIS
+/** Calculate the maximum window for a given tensor shape and border setting
+ *
+ * @param[in] valid_region Valid region object defining the shape of the tensor space for which the window is created.
+ * @param[in] steps        (Optional) Number of elements processed for each step.
+ * @param[in] skip_border  (Optional) If true exclude the border region from the window.
+ * @param[in] border_size  (Optional) Border size.
+ *
+ * @return The maximum window the kernel can be executed on.
+ */
+Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
+
+/** Calculate the maximum window for a given tensor shape and border setting
+ *
+ * @param[in] info        Tensor info object defining the shape of the object for which the window is created.
+ * @param[in] steps       (Optional) Number of elements processed for each step.
+ * @param[in] skip_border (Optional) If true exclude the border region from the window.
+ * @param[in] border_size (Optional) Border size.
+ *
+ * @return The maximum window the kernel can be executed on.
+ */
+inline Window calculate_max_window(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize())
+{
+    return calculate_max_window(info.valid_region(), steps, skip_border, border_size);
+}
+
+/** Calculate the maximum window used by a horizontal kernel for a given tensor shape and border setting
+ *
+ * @param[in] valid_region Valid region object defining the shape of the tensor space for which the window is created.
+ * @param[in] steps        (Optional) Number of elements processed for each step.
+ * @param[in] skip_border  (Optional) If true exclude the border region from the window.
+ * @param[in] border_size  (Optional) Border size. The border region will be excluded from the window.
+ *
+ * @return The maximum window the kernel can be executed on.
+ */
+Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
+
+/** Calculate the maximum window used by a horizontal kernel for a given tensor shape and border setting
+ *
+ * @param[in] info        Tensor info object defining the shape of the object for which the window is created.
+ * @param[in] steps       (Optional) Number of elements processed for each step.
+ * @param[in] skip_border (Optional) If true exclude the border region from the window.
+ * @param[in] border_size (Optional) Border size.
+ *
+ * @return The maximum window the kernel can be executed on.
+ */
+inline Window calculate_max_window_horizontal(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize())
+{
+    return calculate_max_window_horizontal(info.valid_region(), steps, skip_border, border_size);
+}
+
+/** Calculate the maximum window for a given tensor shape and border setting. The window will also includes the border.
+ *
+ * @param[in] valid_region Valid region object defining the shape of the tensor space for which the window is created.
+ * @param[in] steps        (Optional) Number of elements processed for each step.
+ * @param[in] border_size  (Optional) Border size. The border region will be included in the window.
+ *
+ * @return The maximum window the kernel can be executed on.
+ */
+Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Steps &steps = Steps(), BorderSize border_size = BorderSize());
+
+/** Calculate the maximum window for a given tensor shape and border setting. The window will also includes the border.
+ *
+ * @param[in] info        Tensor info object defining the shape of the object for which the window is created.
+ * @param[in] steps       (Optional) Number of elements processed for each step.
+ * @param[in] border_size (Optional) Border size. The border region will be included in the window.
+ *
+ * @return The maximum window the kernel can be executed on.
+ */
+inline Window calculate_max_enlarged_window(const ITensorInfo &info, const Steps &steps = Steps(), BorderSize border_size = BorderSize())
+{
+    return calculate_max_enlarged_window(info.valid_region(), steps, border_size);
+}
+#endif /* DOXYGEN_SKIP_THIS */
+} // namespace arm_compute
+
+#endif /* SRC_CORE_HELPERS_WINDOWHELPERS_H */
diff --git a/src/core/utils/helpers/bit_ops.h b/src/core/utils/helpers/bit_ops.h
new file mode 100644
index 0000000000..ef60214c9f
--- /dev/null
+++ b/src/core/utils/helpers/bit_ops.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_UTILS_HELPERS_BIT_OPS_H
+#define ARM_COMPUTE_UTILS_HELPERS_BIT_OPS_H
+
+#include "support/Requires.h"
+
+#include <type_traits>
+
+namespace arm_compute
+{
+namespace helpers
+{
+namespace bit_ops
+{
+/** Checks if the idx-th bit is set in an integral type
+ *
+ * @param[in] v   Integral input
+ * @param[in] idx Index of the bit to check
+ *
+ * @return True if the idx-th bit is set else false
+ */
+template <typename T, REQUIRES_TA(std::is_integral<T>::value)>
+bool is_bit_set(T v, unsigned int idx)
+{
+    return (v & 1 << idx) != 0;
+}
+} // namespace bit_ops
+} // namespace helpers
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_UTILS_HELPERS_BIT_OPS_H */
diff --git a/src/core/utils/helpers/fft.cpp b/src/core/utils/helpers/fft.cpp
index 4c2f8fa494..64633c643d 100644
--- a/src/core/utils/helpers/fft.cpp
+++ b/src/core/utils/helpers/fft.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/utils/helpers/fft.h"
+#include "src/core/utils/helpers/fft.h"
 
 #include <numeric>
 
diff --git a/src/core/utils/helpers/fft.h b/src/core/utils/helpers/fft.h
new file mode 100644
index 0000000000..f7b99dd7b8
--- /dev/null
+++ b/src/core/utils/helpers/fft.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_UTILS_HELPERS_FFT_H
+#define ARM_COMPUTE_UTILS_HELPERS_FFT_H
+
+#include <set>
+#include <vector>
+
+namespace arm_compute
+{
+namespace helpers
+{
+namespace fft
+{
+/** Decompose a given 1D input size using the provided supported factors.
+ *
+ * @param[in] N                 Input size to be decomposed.
+ * @param[in] supported_factors Supported factors that can be used for decomposition.
+ *
+ * @return A vector with the stages of the decomposition. Will be empty if decomposition failed.
+ */
+std::vector<unsigned int> decompose_stages(unsigned int N, const std::set<unsigned int> &supported_factors);
+/** Calculate digit reverse index vector given fft size and the decomposed stages
+ *
+ * @param N          Input size to calculate digit reverse for
+ * @param fft_stages A vector with the FFT decomposed stages
+ *
+ * @return A vector with the digit reverse indices. Will be empty if it failed.
+ */
+std::vector<unsigned int> digit_reverse_indices(unsigned int N, const std::vector<unsigned int> &fft_stages);
+} // namespace fft
+} // namespace helpers
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_UTILS_HELPERS_FFT_H */
diff --git a/src/core/utils/helpers/float_ops.h b/src/core/utils/helpers/float_ops.h
new file mode 100644
index 0000000000..a475a23b59
--- /dev/null
+++ b/src/core/utils/helpers/float_ops.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_UTILS_HELPERS_FLOAT_OPS_H
+#define ARM_COMPUTE_UTILS_HELPERS_FLOAT_OPS_H
+
+namespace arm_compute
+{
+namespace helpers
+{
+namespace float_ops
+{
+union RawFloat
+{
+    /** Constructor
+     *
+     * @param[in] val Floating-point value
+     */
+    explicit RawFloat(float val)
+        : f32(val)
+    {
+    }
+    /** Extract sign of floating point number
+     *
+     * @return Sign of floating point number
+     */
+    int32_t sign() const
+    {
+        return i32 >> 31;
+    }
+    /** Extract exponent of floating point number
+     *
+     * @return Exponent of floating point number
+     */
+    int32_t exponent() const
+    {
+        return (i32 >> 23) & 0xFF;
+    }
+    /** Extract mantissa of floating point number
+     *
+     * @return Mantissa of floating point number
+     */
+    int32_t mantissa() const
+    {
+        return i32 & 0x007FFFFF;
+    }
+
+    int32_t i32;
+    float   f32;
+};
+
+/** Checks if two floating point numbers are equal given an allowed number of ULPs
+ *
+ * @param[in] a                First number to compare
+ * @param[in] b                Second number to compare
+ * @param[in] max_allowed_ulps (Optional) Number of allowed ULPs
+ *
+ * @return True if number is close else false
+ */
+inline bool is_equal_ulps(float a, float b, int max_allowed_ulps = 0)
+{
+    RawFloat ra(a);
+    RawFloat rb(b);
+
+    // Check ULP distance
+    const int ulps = std::abs(ra.i32 - rb.i32);
+    return ulps <= max_allowed_ulps;
+}
+
+/** Checks if the input floating point number is 1.0f checking if the difference is within a range defined with epsilon
+ *
+ * @param[in] a       Input floating point number
+ * @param[in] epsilon (Optional) Epsilon used to define the error bounds
+ *
+ * @return True if number is close to 1.0f
+ */
+inline bool is_one(float a, float epsilon = 0.00001f)
+{
+    return std::abs(1.0f - a) <= epsilon;
+}
+
+/** Checks if the input floating point number is 0.0f checking if the difference is within a range defined with epsilon
+ *
+ * @param[in] a       Input floating point number
+ * @param[in] epsilon (Optional) Epsilon used to define the error bounds
+ *
+ * @return True if number is close to 0.0f
+ */
+inline bool is_zero(float a, float epsilon = 0.00001f)
+{
+    return std::abs(0.0f - a) <= epsilon;
+}
+} // namespace float_ops
+} // namespace helpers
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_UTILS_HELPERS_FLOAT_OPS_H */
diff --git a/src/core/utils/helpers/tensor_info.h b/src/core/utils/helpers/tensor_info.h
new file mode 100644
index 0000000000..9279532e2a
--- /dev/null
+++ b/src/core/utils/helpers/tensor_info.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_UTILS_HELPERS_TENSOR_INFO_H
+#define ARM_COMPUTE_UTILS_HELPERS_TENSOR_INFO_H
+
+#include "arm_compute/core/ITensorInfo.h"
+
+namespace arm_compute
+{
+namespace helpers
+{
+namespace tensor_info
+{
+/** Checks if the quantization info of given tensors are different
+ *
+ * @param tensor_info_1 Tensor info of the first tensor
+ * @param tensor_info_2 Tensor info of the second tensor
+ * @param tensor_infos  Tensor infos of the rest tensors
+ *
+ * @return True if tensors have mismatching quantization info else false.
+ */
+template <typename... Ts>
+inline bool tensors_have_different_quantization_info(const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos)
+{
+    const QuantizationInfo first_quantization_info = tensor_info_1->quantization_info();
+
+    const std::array < const ITensorInfo *, 1 + sizeof...(Ts) > tensor_infos_array{ { tensor_info_2, std::forward<Ts>(tensor_infos)... } };
+    return std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info)
+    {
+        return tensor_info->quantization_info() != first_quantization_info;
+    });
+}
+} // namespace tensor_info
+} // namespace helpers
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_UTILS_HELPERS_TENSOR_INFO_H */
diff --git a/src/core/utils/helpers/tensor_transform.cpp b/src/core/utils/helpers/tensor_transform.cpp
index 84302ea19f..f2216995a9 100644
--- a/src/core/utils/helpers/tensor_transform.cpp
+++ b/src/core/utils/helpers/tensor_transform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/core/utils/helpers/tensor_transform.h"
 
-#include "arm_compute/core/utils/helpers/bit_ops.h"
+#include "bit_ops.h"
 
 namespace arm_compute
 {
diff --git a/src/graph/algorithms/TopologicalSort.cpp b/src/graph/algorithms/TopologicalSort.cpp
index 3647e13e92..3a69352471 100644
--- a/src/graph/algorithms/TopologicalSort.cpp
+++ b/src/graph/algorithms/TopologicalSort.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,7 @@
 
 #include "arm_compute/graph/Graph.h"
 
-#include "arm_compute/core/utils/misc/Iterable.h"
+#include "support/Iterable.h"
 
 #include <list>
 #include <stack>
@@ -185,4 +185,4 @@ std::vector<NodeID> dfs(Graph &g)
     return dfs_order_vector;
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp
index 5f2f46f72a..cd732553be 100644
--- a/src/graph/backends/CL/CLFunctionsFactory.cpp
+++ b/src/graph/backends/CL/CLFunctionsFactory.cpp
@@ -23,12 +23,12 @@
  */
 #include "arm_compute/graph/backends/CL/CLFunctionFactory.h"
 
-#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/GraphContext.h"
 #include "arm_compute/graph/backends/FunctionHelpers.h"
 #include "arm_compute/runtime/CL/CLFunctions.h"
 #include "arm_compute/runtime/CPP/CPPFunctions.h"
+#include "support/Cast.h"
 
 using namespace arm_compute::utils::cast;
 
diff --git a/src/graph/backends/CL/CLNodeValidator.cpp b/src/graph/backends/CL/CLNodeValidator.cpp
index 8b2ecaf20e..8c1fedd93f 100644
--- a/src/graph/backends/CL/CLNodeValidator.cpp
+++ b/src/graph/backends/CL/CLNodeValidator.cpp
@@ -26,9 +26,9 @@
 #include "arm_compute/graph/backends/ValidateHelpers.h"
 #include "arm_compute/graph/nodes/Nodes.h"
 
-#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/runtime/CL/CLFunctions.h"
 #include "arm_compute/runtime/CPP/CPPFunctions.h"
+#include "support/Cast.h"
 
 using namespace arm_compute::utils::cast;
 
diff --git a/src/graph/backends/CL/CLSubTensorHandle.cpp b/src/graph/backends/CL/CLSubTensorHandle.cpp
index ada0d686ed..b97d25890a 100644
--- a/src/graph/backends/CL/CLSubTensorHandle.cpp
+++ b/src/graph/backends/CL/CLSubTensorHandle.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/graph/backends/CL/CLSubTensorHandle.h"
 
-#include "arm_compute/core/utils/misc/Cast.h"
+#include "support/Cast.h"
 
 namespace arm_compute
 {
diff --git a/src/graph/backends/GLES/GCFunctionsFactory.cpp b/src/graph/backends/GLES/GCFunctionsFactory.cpp
index 8ecb593e11..7d9d388ebe 100644
--- a/src/graph/backends/GLES/GCFunctionsFactory.cpp
+++ b/src/graph/backends/GLES/GCFunctionsFactory.cpp
@@ -23,11 +23,11 @@
  */
 #include "arm_compute/graph/backends/GLES/GCFunctionFactory.h"
 
-#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/GraphContext.h"
 #include "arm_compute/graph/backends/FunctionHelpers.h"
 #include "arm_compute/runtime/GLES_COMPUTE/GCFunctions.h"
+#include "support/Cast.h"
 
 using namespace arm_compute::utils::cast;
 
diff --git a/src/graph/backends/GLES/GCNodeValidator.cpp b/src/graph/backends/GLES/GCNodeValidator.cpp
index 159e51246a..13a93a2556 100644
--- a/src/graph/backends/GLES/GCNodeValidator.cpp
+++ b/src/graph/backends/GLES/GCNodeValidator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,8 +26,8 @@
 #include "arm_compute/graph/backends/ValidateHelpers.h"
 #include "arm_compute/graph/nodes/Nodes.h"
 
-#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/runtime/GLES_COMPUTE/GCFunctions.h"
+#include "support/Cast.h"
 
 using namespace arm_compute::utils::cast;
 
diff --git a/src/graph/backends/NEON/NEFunctionFactory.cpp b/src/graph/backends/NEON/NEFunctionFactory.cpp
index 61df45d8d0..95c6631830 100644
--- a/src/graph/backends/NEON/NEFunctionFactory.cpp
+++ b/src/graph/backends/NEON/NEFunctionFactory.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/graph/backends/NEON/NEFunctionFactory.h"
 
-#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/GraphContext.h"
 #include "arm_compute/graph/Logger.h"
@@ -33,6 +32,7 @@
 #include "arm_compute/graph/nodes/Nodes.h"
 #include "arm_compute/runtime/CPP/CPPFunctions.h"
 #include "arm_compute/runtime/NEON/NEFunctions.h"
+#include "support/Cast.h"
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute::utils::cast;
diff --git a/src/graph/backends/NEON/NENodeValidator.cpp b/src/graph/backends/NEON/NENodeValidator.cpp
index 19c96eab50..63e8ff910f 100644
--- a/src/graph/backends/NEON/NENodeValidator.cpp
+++ b/src/graph/backends/NEON/NENodeValidator.cpp
@@ -26,9 +26,9 @@
 #include "arm_compute/graph/backends/ValidateHelpers.h"
 #include "arm_compute/graph/nodes/Nodes.h"
 
-#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/runtime/CPP/CPPFunctions.h"
 #include "arm_compute/runtime/NEON/NEFunctions.h"
+#include "support/Cast.h"
 
 using namespace arm_compute::utils::cast;
 
diff --git a/src/graph/backends/NEON/NETensorHandle.cpp b/src/graph/backends/NEON/NETensorHandle.cpp
index c8fc3f1ae2..4393156e8a 100644
--- a/src/graph/backends/NEON/NETensorHandle.cpp
+++ b/src/graph/backends/NEON/NETensorHandle.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/graph/backends/NEON/NETensorHandle.h"
 
-#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "support/Cast.h"
 
 namespace arm_compute
 {
diff --git a/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
index fd16625780..b45f453f23 100644
--- a/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
+++ b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
@@ -33,7 +33,7 @@
 #include "arm_compute/graph/backends/BackendRegistry.h"
 
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/utils/misc/Cast.h"
+#include "support/Cast.h"
 
 #include <algorithm>
 #include <map>
diff --git a/src/graph/mutators/DepthConcatSubTensorMutator.cpp b/src/graph/mutators/DepthConcatSubTensorMutator.cpp
index fa63f5625b..963b948432 100644
--- a/src/graph/mutators/DepthConcatSubTensorMutator.cpp
+++ b/src/graph/mutators/DepthConcatSubTensorMutator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,8 +30,8 @@
 #include "arm_compute/graph/backends/BackendRegistry.h"
 #include "arm_compute/graph/nodes/ConcatenateLayerNode.h"
 
-#include "arm_compute/core/utils/misc/Cast.h"
-#include "arm_compute/core/utils/misc/Iterable.h"
+#include "support/Cast.h"
+#include "support/Iterable.h"
 
 namespace arm_compute
 {
diff --git a/src/graph/mutators/GroupedConvolutionMutator.cpp b/src/graph/mutators/GroupedConvolutionMutator.cpp
index e3d3812c1d..b7c551ce8b 100644
--- a/src/graph/mutators/GroupedConvolutionMutator.cpp
+++ b/src/graph/mutators/GroupedConvolutionMutator.cpp
@@ -30,7 +30,7 @@
 #include "arm_compute/graph/backends/BackendRegistry.h"
 #include "arm_compute/graph/nodes/Nodes.h"
 
-#include "arm_compute/core/utils/misc/Cast.h"
+#include "support/Cast.h"
 
 #include "support/StringSupport.h"
 
diff --git a/src/graph/mutators/NodeExecutionMethodMutator.cpp b/src/graph/mutators/NodeExecutionMethodMutator.cpp
index 48bb9f7fc0..09a3cf50c0 100644
--- a/src/graph/mutators/NodeExecutionMethodMutator.cpp
+++ b/src/graph/mutators/NodeExecutionMethodMutator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,7 +29,7 @@
 #include "arm_compute/graph/backends/BackendRegistry.h"
 #include "arm_compute/graph/nodes/Nodes.h"
 
-#include "arm_compute/core/utils/misc/Cast.h"
+#include "support/Cast.h"
 
 namespace arm_compute
 {
diff --git a/src/graph/mutators/NodeFusionMutator.cpp b/src/graph/mutators/NodeFusionMutator.cpp
index 2a80825b36..1d47668cf2 100644
--- a/src/graph/mutators/NodeFusionMutator.cpp
+++ b/src/graph/mutators/NodeFusionMutator.cpp
@@ -30,7 +30,7 @@
 #include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h"
 #include "arm_compute/graph/nodes/Nodes.h"
 
-#include "arm_compute/core/utils/misc/Cast.h"
+#include "support/Cast.h"
 
 #include <set>
 
@@ -300,11 +300,12 @@ IGraphMutator::MutationType NodeFusionMutator::type() const
 void NodeFusionMutator::mutate(Graph &g)
 {
     // Supported activations when fusing
-    const std::set<Activation> supported_fused_activations    = { Activation::ABS, Activation::BOUNDED_RELU, Activation::ELU,
-                                                                  Activation::HARD_SWISH, Activation::IDENTITY, Activation::LEAKY_RELU,
-                                                                  Activation::LINEAR, Activation::LOGISTIC, Activation::LU_BOUNDED_RELU,
-                                                                  Activation::RELU, Activation::SOFT_RELU, Activation::SQRT,
-                                                                  Activation::SQUARE, Activation::TANH  };
+    const std::set<Activation> supported_fused_activations = { Activation::ABS, Activation::BOUNDED_RELU, Activation::ELU,
+                                                               Activation::HARD_SWISH, Activation::IDENTITY, Activation::LEAKY_RELU,
+                                                               Activation::LINEAR, Activation::LOGISTIC, Activation::LU_BOUNDED_RELU,
+                                                               Activation::RELU, Activation::SOFT_RELU, Activation::SQRT,
+                                                               Activation::SQUARE, Activation::TANH
+                                                             };
 
     // Preconditions
     auto empty_prec = [](INode &)
diff --git a/src/graph/mutators/SplitLayerSubTensorMutator.cpp b/src/graph/mutators/SplitLayerSubTensorMutator.cpp
index 359bba47ef..2c28a1a2d1 100644
--- a/src/graph/mutators/SplitLayerSubTensorMutator.cpp
+++ b/src/graph/mutators/SplitLayerSubTensorMutator.cpp
@@ -30,8 +30,8 @@
 #include "arm_compute/graph/backends/BackendRegistry.h"
 #include "arm_compute/graph/nodes/SplitLayerNode.h"
 
-#include "arm_compute/core/utils/misc/Cast.h"
-#include "arm_compute/core/utils/misc/Iterable.h"
+#include "support/Cast.h"
+#include "support/Iterable.h"
 
 namespace arm_compute
 {
diff --git a/src/graph/mutators/SyntheticDataTypeMutator.cpp b/src/graph/mutators/SyntheticDataTypeMutator.cpp
index dbbebdfb2b..532c0e821b 100644
--- a/src/graph/mutators/SyntheticDataTypeMutator.cpp
+++ b/src/graph/mutators/SyntheticDataTypeMutator.cpp
@@ -29,7 +29,7 @@
 #include "arm_compute/graph/Utils.h"
 #include "arm_compute/graph/nodes/Nodes.h"
 
-#include "arm_compute/core/utils/misc/Cast.h"
+#include "support/Cast.h"
 
 #include <set>
 
diff --git a/src/runtime/CL/CLHelpers.cpp b/src/runtime/CL/CLHelpers.cpp
index adfdc3c917..5f1842f76d 100644
--- a/src/runtime/CL/CLHelpers.cpp
+++ b/src/runtime/CL/CLHelpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/Error.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLRuntimeContext.h"
 
 namespace
diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp
index efbc68f50e..a1743c56e6 100644
--- a/src/runtime/CL/CLMemory.cpp
+++ b/src/runtime/CL/CLMemory.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CL/CLMemory.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/utils/misc/Cast.h"
+#include "support/Cast.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/CL/CLRuntimeContext.cpp b/src/runtime/CL/CLRuntimeContext.cpp
index 2fc7f93adf..571e30931c 100644
--- a/src/runtime/CL/CLRuntimeContext.cpp
+++ b/src/runtime/CL/CLRuntimeContext.cpp
@@ -26,6 +26,8 @@
 #include "arm_compute/runtime/CL/CLHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
+#include "support/MemorySupport.h"
+
 namespace arm_compute
 {
 CLRuntimeContext::CLRuntimeContext()
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index 90d77883f6..f37fc779fe 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,6 +28,8 @@
 #include "arm_compute/runtime/CL/CLRuntimeContext.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
+#include "support/MemorySupport.h"
+
 namespace arm_compute
 {
 const cl::Buffer CLTensorAllocator::_empty_buffer = cl::Buffer();
diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
index ad6e7ba97b..57c4f685f6 100644
--- a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
+++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
@@ -24,13 +24,14 @@
 
 #include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"
 
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/Utils.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/runtime/Utils.h"
 
 namespace arm_compute
 {
@@ -47,7 +48,7 @@ Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITen
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid reduction operation");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions), "Reduction axis greater than max number of dimensions");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
-    const unsigned int num_of_stages = calculate_number_of_stages_only_x_axis(input->dimension(0), axis);
+    const unsigned int num_of_stages = utils::calculate_number_of_stages_only_x_axis(input->dimension(0), axis);
 
     DataType   output_data_type = DataType::S32;
     TensorInfo not_reshaped_output;
@@ -115,7 +116,7 @@ void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *ou
 void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    _num_of_stages  = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
+    _num_of_stages  = utils::calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
     _reduction_axis = axis;
 
     const TensorShape output_shape     = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
@@ -172,4 +173,4 @@ void CLArgMinMaxLayer::run()
     }
     _reshape.run();
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp
index 4214813446..2eb310b893 100644
--- a/src/runtime/CL/functions/CLConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp
@@ -36,6 +36,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
+#include "src/core/helpers/AutoConfiguration.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
index 4c787673b5..b291ae5b88 100644
--- a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
+++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
@@ -23,6 +23,8 @@
  */
 #include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h"
 
+#include "support/MemorySupport.h"
+
 namespace arm_compute
 {
 void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 630352e4e6..85355f0f17 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -30,6 +30,8 @@
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
+#include "support/MemorySupport.h"
+
 #include <cmath>
 #include <memory>
 #include <tuple>
diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp
index 529f7bfb3e..6167e9de0a 100644
--- a/src/runtime/CL/functions/CLCropResize.cpp
+++ b/src/runtime/CL/functions/CLCropResize.cpp
@@ -25,6 +25,10 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "support/MemorySupport.h"
 
 #include <cstddef>
 
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index cd55336d9a..e6717b6d01 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -29,6 +29,8 @@
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
+#include "support/MemorySupport.h"
+
 #include <cmath>
 #include <memory>
 #include <tuple>
diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
index c1055dda36..07e7a18941 100644
--- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -43,7 +43,7 @@ void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weig
 }
 
 void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
-                                         const PadStrideInfo &conv_info,
+                                         const PadStrideInfo       &conv_info,
                                          const ActivationLayerInfo &act_info)
 {
     // Set GPU target
diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
index 3515c25d82..0ffafa0221 100644
--- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 #include <memory>
 #include <tuple>
diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp
index 7d15d33ab5..1269cba90d 100644
--- a/src/runtime/CL/functions/CLFFT1D.cpp
+++ b/src/runtime/CL/functions/CLFFT1D.cpp
@@ -25,8 +25,8 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/helpers/fft.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/utils/helpers/fft.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
index 1def674bb6..4d0eab81ee 100644
--- a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
@@ -26,10 +26,13 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/helpers/fft.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CPP/CPPScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/utils/helpers/fft.h"
+
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/CL/functions/CLFill.cpp b/src/runtime/CL/functions/CLFill.cpp
index 6c0f1786f0..a89383ec31 100644
--- a/src/runtime/CL/functions/CLFill.cpp
+++ b/src/runtime/CL/functions/CLFill.cpp
@@ -26,6 +26,8 @@
 #include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
 #include "arm_compute/core/Types.h"
 
+#include "support/MemorySupport.h"
+
 #include <utility>
 
 namespace arm_compute
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 4f365b6a61..75e87c382b 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -25,10 +25,10 @@
 
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/Cast.h"
 #include "support/MemorySupport.h"
 
 #include <algorithm>
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index d56b341abf..ccae6713a6 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -23,10 +23,7 @@
  */
 #include "arm_compute/runtime/CL/functions/CLGEMM.h"
 
-#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h"
-#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/Helpers.h"
@@ -35,12 +32,18 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/helpers/float_ops.h"
-#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h"
 #include "arm_compute/runtime/ITensorAllocator.h"
+#include "src/core/CL/ICLGEMMKernelConfiguration.h"
+#include "src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h"
+#include "src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/utils/helpers/float_ops.h"
+#include "src/runtime/CL/gemm/CLGEMMKernelSelection.h"
+#include "support/Cast.h"
+
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index ee90b39c2b..e871b39805 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -27,10 +27,11 @@
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "support/Cast.h"
 
 #include <cmath>
 #include <memory>
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 30dce5b8fe..7a8de6c1f5 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -24,8 +24,6 @@
 #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h"
-#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/KernelDescriptors.h"
@@ -35,7 +33,10 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h"
+#include "src/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h"
+#include "src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/runtime/CL/gemm/CLGEMMKernelSelection.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
index 45dc402449..5291de074a 100644
--- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
+++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
index fce1fe43a2..4a60ee9d08 100644
--- a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
@@ -26,6 +26,8 @@
 #include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h"
 #include "arm_compute/core/Types.h"
 
+#include "support/MemorySupport.h"
+
 namespace arm_compute
 {
 CLInstanceNormalizationLayer::CLInstanceNormalizationLayer()
diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
index e30b1dbb86..76a531b1c9 100644
--- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 #include <memory>
 
diff --git a/src/runtime/CL/functions/CLPriorBoxLayer.cpp b/src/runtime/CL/functions/CLPriorBoxLayer.cpp
index 1907c7cc08..fefbff639d 100644
--- a/src/runtime/CL/functions/CLPriorBoxLayer.cpp
+++ b/src/runtime/CL/functions/CLPriorBoxLayer.cpp
@@ -31,6 +31,8 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
+#include "support/MemorySupport.h"
+
 using namespace arm_compute;
 
 CLPriorBoxLayer::CLPriorBoxLayer()
diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp
index 15a54c7928..c493471667 100644
--- a/src/runtime/CL/functions/CLQLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp
@@ -30,6 +30,7 @@
 #include "arm_compute/core/utils/misc/InfoHelpers.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
index 0e2ede7167..4ea7f7642f 100644
--- a/src/runtime/CL/functions/CLReduceMean.cpp
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -23,12 +23,13 @@
  */
 #include "arm_compute/runtime/CL/functions/CLReduceMean.h"
 
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index 54e91fb8d8..208371c45d 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -30,7 +30,9 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/Utils.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/runtime/Utils.h"
+
 #include "support/MemorySupport.h"
 
 namespace arm_compute
@@ -47,7 +49,7 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
 
-    const unsigned int num_of_stages       = calculate_number_of_stages_only_x_axis(input->dimension(0), axis);
+    const unsigned int num_of_stages       = utils::calculate_number_of_stages_only_x_axis(input->dimension(0), axis);
     const bool         is_serial           = needs_serialized_reduction(op, input->data_type(), axis);
     const bool         is_reshape_required = !keep_dims;
 
@@ -194,7 +196,7 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign
 void CLReductionOperation::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    _num_of_stages       = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
+    _num_of_stages       = utils::calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
     _reduction_axis      = axis;
     _is_serial           = needs_serialized_reduction(op, input->info()->data_type(), axis);
     _is_reshape_required = !keep_dims;
diff --git a/src/runtime/CL/functions/CLRemap.cpp b/src/runtime/CL/functions/CLRemap.cpp
index 60b72c5f87..1e3d614402 100644
--- a/src/runtime/CL/functions/CLRemap.cpp
+++ b/src/runtime/CL/functions/CLRemap.cpp
@@ -42,7 +42,7 @@ void CLRemap::configure(ICLTensor *input, const ICLTensor *map_x, const ICLTenso
 
 void CLRemap::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy,
                         BorderMode border_mode,
-                        uint8_t constant_border_value)
+                        uint8_t    constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp
index c7d7df75d2..ef8010847b 100644
--- a/src/runtime/CL/functions/CLSelect.cpp
+++ b/src/runtime/CL/functions/CLSelect.cpp
@@ -27,6 +27,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
+#include "support/MemorySupport.h"
+
 using namespace arm_compute;
 
 namespace arm_compute
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index 720f9111a5..759c8706a1 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -31,6 +31,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/helpers/SoftmaxHelpers.h"
 
 namespace arm_compute
 {
@@ -63,7 +64,7 @@ void CLSoftmaxLayerGeneric<IS_LOG>::configure(const CLCompileContext &compile_co
     {
         _memory_group.manage(&_input_permuted);
         _memory_group.manage(&_output_permuted);
-        _permute_input.configure(compile_context, input, &_input_permuted, get_permutation_vector_from_softmax_axis(actual_axis));
+        _permute_input.configure(compile_context, input, &_input_permuted, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
         tmp_output = &_output_permuted;
     }
 
@@ -99,7 +100,7 @@ void CLSoftmaxLayerGeneric<IS_LOG>::configure(const CLCompileContext &compile_co
     _sum.allocator()->allocate();
     if(_needs_permute)
     {
-        _permute_output.configure(compile_context, &_output_permuted, output, get_permutation_vector_from_softmax_axis(actual_axis));
+        _permute_output.configure(compile_context, &_output_permuted, output, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
         _input_permuted.allocator()->allocate();
         _output_permuted.allocator()->allocate();
     }
@@ -117,7 +118,7 @@ Status CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const I
     const bool   needs_permute = actual_axis != 0;
     if(needs_permute)
     {
-        const PermutationVector permutation_vector = get_permutation_vector_from_softmax_axis(actual_axis);
+        const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
         const TensorShape       permuted_shape     = misc::shape_calculator::compute_permutation_output_shape(*input, permutation_vector);
         TensorInfo              input_permuted(input->clone()->set_tensor_shape(permuted_shape));
         ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(input, &input_permuted, permutation_vector));
diff --git a/src/runtime/CL/functions/CLSplit.cpp b/src/runtime/CL/functions/CLSplit.cpp
index db0b14b9a2..0b27371e3f 100644
--- a/src/runtime/CL/functions/CLSplit.cpp
+++ b/src/runtime/CL/functions/CLSplit.cpp
@@ -30,6 +30,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index 09a35a6f27..7ad017f918 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -102,7 +102,7 @@ void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *we
 }
 
 void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
-                                           const PadStrideInfo &conv_info,
+                                           const PadStrideInfo       &conv_info,
                                            const ActivationLayerInfo &act_info, bool enable_fast_math)
 {
     // Get indices for the width and height
diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelection.h b/src/runtime/CL/gemm/CLGEMMKernelSelection.h
new file mode 100644
index 0000000000..f6fad7e4ff
--- /dev/null
+++ b/src/runtime/CL/gemm/CLGEMMKernelSelection.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CLGEMMKERNELSELECTION_H
+#define SRC_CLGEMMKERNELSELECTION_H
+
+#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h"
+#include "src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h"
+#include "src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h"
+#include "src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h"
+
+#include "support/MemorySupport.h"
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** CLGEMMKernelSelection factory class */
+class CLGEMMKernelSelectionFactory final
+{
+public:
+    /** Static method to select the GEMM kernel accordingly with the GPU target and GEMM's dimensionality
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return CLGEMMKernelSelection class
+     */
+    static std::unique_ptr<ICLGEMMKernelSelection> create(GPUTarget gpu)
+    {
+        switch(get_arch_from_target(gpu))
+        {
+            case GPUTarget::MIDGARD:
+                return support::cpp14::make_unique<CLGEMMKernelSelectionMidgard>(gpu);
+            case GPUTarget::BIFROST:
+                return support::cpp14::make_unique<CLGEMMKernelSelectionBifrost>(gpu);
+            case GPUTarget::VALHALL:
+                return support::cpp14::make_unique<CLGEMMKernelSelectionValhall>(gpu);
+            default:
+                ARM_COMPUTE_ERROR("Not supported GPU target");
+        }
+    }
+};
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /* SRC_CLGEMMKERNELSELECTION_H */
diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp b/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp
index b1dd690ca5..73b90568f5 100644
--- a/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp
+++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp
@@ -21,11 +21,11 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h"
+#include "src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
+#include "src/core/CL/gemm/CLGEMMHelpers.h"
 
 #include <map>
 #include <utility>
diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h b/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h
new file mode 100644
index 0000000000..a495b48301
--- /dev/null
+++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CLGEMMKERNELSELECTIONBIFROST_H
+#define SRC_CLGEMMKERNELSELECTIONBIFROST_H
+
+#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h"
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** Bifrost based OpenCL GEMMKernel selection */
+class CLGEMMKernelSelectionBifrost final : public ICLGEMMKernelSelection
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    CLGEMMKernelSelectionBifrost(GPUTarget gpu);
+
+    // Inherited overridden method
+    CLGEMMKernelType select_kernel(const CLGEMMKernelSelectionParams &params) override;
+
+private:
+    CLGEMMKernelType g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+    CLGEMMKernelType g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+    CLGEMMKernelType default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+    CLGEMMKernelType default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+    CLGEMMKernelType default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+};
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /* SRC_CLGEMMKERNELSELECTIONBIFROST_H */
diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp b/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp
index 324c2f7dca..d172a827b5 100644
--- a/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp
+++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp
@@ -21,12 +21,12 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h"
+#include "src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
 #include "arm_compute/core/GPUTarget.h"
+#include "src/core/CL/gemm/CLGEMMHelpers.h"
 
 #include <map>
 #include <utility>
diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h b/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h
new file mode 100644
index 0000000000..3f6003f7dc
--- /dev/null
+++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CLGEMMKERNELSELECTIONMIDGARD_H
+#define SRC_CLGEMMKERNELSELECTIONMIDGARD_H
+
+#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h"
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** Midgard based OpenCL GEMMKernel selection */
+class CLGEMMKernelSelectionMidgard final : public ICLGEMMKernelSelection
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    CLGEMMKernelSelectionMidgard(GPUTarget gpu);
+
+    // Inherited overridden method
+    CLGEMMKernelType select_kernel(const CLGEMMKernelSelectionParams &params) override;
+
+private:
+    CLGEMMKernelType default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+    CLGEMMKernelType default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+    CLGEMMKernelType default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+};
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /* SRC_CLGEMMKERNELSELECTIONMIDGARD_H */
diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp b/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp
index c50c7ae76b..acae0e7565 100644
--- a/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp
+++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp
@@ -21,11 +21,11 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h"
+#include "src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
+#include "src/core/CL/gemm/CLGEMMHelpers.h"
 
 #include <map>
 #include <utility>
diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h b/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h
new file mode 100644
index 0000000000..cbea9ea548
--- /dev/null
+++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CLGEMMKERNELSELECTIONVALHALL_H
+#define SRC_CLGEMMKERNELSELECTIONVALHALL_H
+
+#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h"
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** Valhall based OpenCL GEMMKernel selection */
+class CLGEMMKernelSelectionValhall final : public ICLGEMMKernelSelection
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    CLGEMMKernelSelectionValhall(GPUTarget gpu);
+
+    // Inherited overridden method
+    CLGEMMKernelType select_kernel(const CLGEMMKernelSelectionParams &params) override;
+
+private:
+    CLGEMMKernelType default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+    CLGEMMKernelType default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+    CLGEMMKernelType default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+};
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /* SRC_CLGEMMKERNELSELECTIONVALHALL_H */
diff --git a/src/runtime/CL/tuners/BifrostTuner.cpp b/src/runtime/CL/tuners/BifrostTuner.cpp
index 52644bf192..a6474c9835 100644
--- a/src/runtime/CL/tuners/BifrostTuner.cpp
+++ b/src/runtime/CL/tuners/BifrostTuner.cpp
@@ -25,7 +25,7 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernels.h"
-#include "arm_compute/core/utils/misc/Cast.h"
+#include "support/Cast.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/CL/tuners/MidgardTuner.cpp b/src/runtime/CL/tuners/MidgardTuner.cpp
index e49e15508b..58b0d579d2 100644
--- a/src/runtime/CL/tuners/MidgardTuner.cpp
+++ b/src/runtime/CL/tuners/MidgardTuner.cpp
@@ -25,7 +25,7 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernels.h"
-#include "arm_compute/core/utils/misc/Cast.h"
+#include "support/Cast.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index f017006de7..e6b0ec20b8 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -27,7 +27,8 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/runtime/CPUUtils.h"
+#include "src/runtime/CPUUtils.h"
+#include "support/MemorySupport.h"
 #include "support/Mutex.h"
 
 #include <atomic>
diff --git a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
index 9d62733384..fdb4c9f0f6 100644
--- a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 #include <list>
 
diff --git a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
index 3507a3ac45..31f1fafd69 100644
--- a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 #include <cstddef>
 #include <ios>
diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp
index 4d6caaee01..a7dd464540 100644
--- a/src/runtime/CPUUtils.cpp
+++ b/src/runtime/CPUUtils.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/CPUUtils.h"
+#include "src/runtime/CPUUtils.h"
 
 #include "arm_compute/core/CPP/CPPTypes.h"
 #include "arm_compute/core/Error.h"
@@ -352,6 +352,10 @@ int get_max_cpus()
 
 namespace arm_compute
 {
+namespace utils
+{
+namespace cpu
+{
 void get_cpu_configuration(CPUInfo &cpuinfo)
 {
 #if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
@@ -460,5 +464,6 @@ unsigned int get_threads_hint()
 
     return num_threads_hint;
 }
-
+} // namespace cpu
+} // namespace utils
 } // namespace arm_compute
diff --git a/src/runtime/CPUUtils.h b/src/runtime/CPUUtils.h
new file mode 100644
index 0000000000..452d3d58ca
--- /dev/null
+++ b/src/runtime/CPUUtils.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_RUNTIME_CPU_UTILS_H
+#define ARM_COMPUTE_RUNTIME_CPU_UTILS_H
+
+namespace arm_compute
+{
+class CPUInfo;
+
+namespace utils
+{
+namespace cpu
+{
+/** This function will try to detect the CPU configuration on the system and will fill
+ *  the cpuinfo object accordingly to reflect this.
+ *
+ * @param[out] cpuinfo @ref CPUInfo to be used to hold the system's cpu configuration.
+ */
+void get_cpu_configuration(CPUInfo &cpuinfo);
+/** Some systems have both big and small cores, this fuction computes the minimum number of cores
+ *  that are exactly the same on the system. To maximize performance the library attempts to process
+ *  workloads concurrently using as many threads as big cores are available on the system.
+ *
+ * @return The minumum number of common cores.
+ */
+unsigned int get_threads_hint();
+} // namespace cpu
+} // namespace utils
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_RUNTIME_CPU_UTILS_H */
diff --git a/src/runtime/DeviceProperties.cpp b/src/runtime/DeviceProperties.cpp
index 5d7ae020d7..ec9f4a16ed 100644
--- a/src/runtime/DeviceProperties.cpp
+++ b/src/runtime/DeviceProperties.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,12 +23,12 @@
  */
 #include "arm_compute/runtime/DeviceProperties.h"
 
-#include "arm_compute/runtime/CPUUtils.h"
+#include "src/runtime/CPUUtils.h"
 
 namespace arm_compute
 {
 DeviceProperties::DeviceProperties()
 {
-    get_cpu_configuration(cpu_info);
+    utils::cpu::get_cpu_configuration(cpu_info);
 }
 } // namespace arm_compute
diff --git a/src/runtime/GLES_COMPUTE/GCMemory.cpp b/src/runtime/GLES_COMPUTE/GCMemory.cpp
index 998f8a5cc4..4d74555f4e 100644
--- a/src/runtime/GLES_COMPUTE/GCMemory.cpp
+++ b/src/runtime/GLES_COMPUTE/GCMemory.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/GLES_COMPUTE/GCMemory.h"
 
-#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h"
+#include "support/Cast.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp
index 9e23974b8d..807412eb17 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp
@@ -29,6 +29,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp
index 53df3699b0..43df3d5e23 100644
--- a/src/runtime/IScheduler.cpp
+++ b/src/runtime/IScheduler.cpp
@@ -26,17 +26,17 @@
 #include "arm_compute/core/CPP/ICPPKernel.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/runtime/CPUUtils.h"
-#include "arm_compute/runtime/SchedulerUtils.h"
+#include "src/runtime/CPUUtils.h"
+#include "src/runtime/SchedulerUtils.h"
 
 namespace arm_compute
 {
 IScheduler::IScheduler()
     : _cpu_info()
 {
-    get_cpu_configuration(_cpu_info);
+    utils::cpu::get_cpu_configuration(_cpu_info);
     // Work out the best possible number of execution threads
-    _num_threads_hint = get_threads_hint();
+    _num_threads_hint = utils::cpu::get_threads_hint();
 }
 
 CPUInfo &IScheduler::cpu_info()
@@ -74,7 +74,7 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, ITensor
 
         //in c++17 this can be swapped for   auto [ m_threads, n_threads ] = split_2d(...
         unsigned m_threads, n_threads;
-        std::tie(m_threads, n_threads) = split_2d(this->num_threads(), m, n);
+        std::tie(m_threads, n_threads) = scheduler_utils::split_2d(this->num_threads(), m, n);
 
         std::vector<IScheduler::Workload> workloads;
         for(unsigned int ni = 0; ni != n_threads; ++ni)
diff --git a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
index 82316c49c6..f2181e0a74 100644
--- a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
+++ b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
 
 #include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/Utils.h"
+#include "src/runtime/Utils.h"
 
 namespace arm_compute
 {
@@ -36,6 +36,6 @@ INESimpleFunctionNoBorder::INESimpleFunctionNoBorder(IRuntimeContext *ctx)
 
 void INESimpleFunctionNoBorder::run()
 {
-    schedule_kernel_on_ctx(_ctx, _kernel.get(), Window::DimY);
+    utils::schedule_kernel_on_ctx(_ctx, _kernel.get(), Window::DimY);
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
index 0664d3c9d5..70bbba62ad 100644
--- a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
+++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 
+#include "support/MemorySupport.h"
+
 namespace arm_compute
 {
 NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
index 5a593e9c74..eab40ac5be 100644
--- a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,8 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
+#include "support/MemorySupport.h"
+
 using namespace arm_compute;
 
 NEBatchNormalizationLayer::NEBatchNormalizationLayer()
diff --git a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
index c06a8aa0e0..2705cffe68 100644
--- a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
+++ b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 
+#include "support/MemorySupport.h"
+
 namespace arm_compute
 {
 void NEBatchToSpaceLayer::configure(const ITensor *input, const ITensor *block_shape, ITensor *output)
diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
index 8df4f4cb62..72bd9e6b19 100644
--- a/src/runtime/NEON/functions/NEConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
@@ -35,6 +35,7 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
+#include "src/core/helpers/AutoConfiguration.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NECropResize.cpp b/src/runtime/NEON/functions/NECropResize.cpp
index f6ed2ec250..f8f99169aa 100644
--- a/src/runtime/NEON/functions/NECropResize.cpp
+++ b/src/runtime/NEON/functions/NECropResize.cpp
@@ -25,6 +25,8 @@
 
 #include "arm_compute/runtime/NEON/functions/NECropResize.h"
 
+#include "support/MemorySupport.h"
+
 #include <cstddef>
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index dff3070239..cb9ab168a7 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 using namespace arm_compute::misc::shape_calculator;
 
diff --git a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
index e363f89482..0aaa37ec92 100644
--- a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 
+#include "support/MemorySupport.h"
+
 namespace arm_compute
 {
 void NEDepthToSpaceLayer::configure(const ITensor *input, ITensor *output, int32_t block_shape)
diff --git a/src/runtime/NEON/functions/NEFFT1D.cpp b/src/runtime/NEON/functions/NEFFT1D.cpp
index 744a91521f..2c53b185df 100644
--- a/src/runtime/NEON/functions/NEFFT1D.cpp
+++ b/src/runtime/NEON/functions/NEFFT1D.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,8 +25,8 @@
 
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/helpers/fft.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/utils/helpers/fft.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
index cd68788145..a46fc9f45f 100644
--- a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,8 +26,11 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/helpers/fft.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/utils/helpers/fft.h"
+
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 4dcf41e360..d956d16f4d 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -30,6 +30,8 @@
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
+#include "support/MemorySupport.h"
+
 #include <algorithm>
 #include <cmath>
 
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 3b8ca44ed7..4166cff97a 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEGEMM.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -34,6 +33,8 @@
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 #include <cmath>
 
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index ad349cb635..5b0848398d 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -23,13 +23,13 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
 
-#include "src/core/NEON/kernels/assembly/arm_gemm.hpp"
-
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h"
-
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
 #include "src/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h"
+#include "src/core/NEON/kernels/assembly/arm_gemm.hpp"
+
+#include "support/MemorySupport.h"
 
 #include <arm_neon.h>
 
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 83db146a8a..36357dde41 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -33,6 +33,7 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/helpers/AutoConfiguration.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
index 3d5377892a..13210a06cd 100644
--- a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
+++ b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
index 11989d3225..7610d15787 100644
--- a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 #include <cmath>
 #include <memory>
diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp
index 21c349ba95..03c597a3bf 100644
--- a/src/runtime/NEON/functions/NEPadLayer.cpp
+++ b/src/runtime/NEON/functions/NEPadLayer.cpp
@@ -27,6 +27,7 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
index fda130bf69..bcf6bef9c7 100644
--- a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
+++ b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,6 +31,8 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
+#include "support/MemorySupport.h"
+
 namespace arm_compute
 {
 void NEPriorBoxLayer::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info)
diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
index 5a6b51337a..95f20ae1a9 100644
--- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
@@ -30,6 +30,7 @@
 #include "arm_compute/core/utils/misc/InfoHelpers.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
index 021f7b530a..c3c5529c09 100644
--- a/src/runtime/NEON/functions/NEReduceMean.cpp
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -23,11 +23,12 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index 91176bfa45..4938a56b3f 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index 2278f07a1c..bbf8343c2b 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -30,12 +30,14 @@
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Rounding.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
 
 #include "src/core/utils/ScaleUtils.h"
 
+#include "support/MemorySupport.h"
+#include "support/Rounding.h"
+
 #include <cmath>
 #include <cstddef>
 #include <utility>
diff --git a/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp b/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp
index b0cafae520..d165b2235c 100644
--- a/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp
+++ b/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h"
+#include "src/runtime/NEON/functions/NESimpleAssemblyFunction.h"
 
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
diff --git a/src/runtime/NEON/functions/NESimpleAssemblyFunction.h b/src/runtime/NEON/functions/NESimpleAssemblyFunction.h
new file mode 100644
index 0000000000..e9be54d35f
--- /dev/null
+++ b/src/runtime/NEON/functions/NESimpleAssemblyFunction.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NESIMPLEASSEMBLYFUNCTION_H
+#define ARM_COMPUTE_NESIMPLEASSEMBLYFUNCTION_H
+
+#include "arm_compute/runtime/IFunction.h"
+#include "src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Basic interface for functions which have a single NEON GEMM wrapper kernel to run */
+class NESimpleAssemblyFunction : public IFunction
+{
+public:
+    /** Constructor */
+    NESimpleAssemblyFunction();
+
+    /** Configure the function with the kernel to run
+     *
+     * @param[in] kernel GEMM Wrapper kernel configured and ready to run
+     *
+     * @note The kernel is expected to have a 1D window. The function will multi-thread this window across the X dimension.
+     */
+    void configure(std::unique_ptr<INEGEMMWrapperKernel> kernel);
+
+    // Inherited methods overridden:
+    void run() override final;
+
+protected:
+    std::unique_ptr<INEGEMMWrapperKernel> _kernel; /**< Kernel to run */
+};
+} //namespace arm_compute
+#endif /*ARM_COMPUTE_NESIMPLEASSEMBLYFUNCTION_H */
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index e763caa3a3..4f773861d2 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/helpers/SoftmaxHelpers.h"
 
 namespace arm_compute
 {
@@ -53,7 +54,7 @@ void NESoftmaxLayerGeneric<IS_LOG>::configure(ITensor *input, ITensor *output, f
         // Add to the memory manager _input_permuted
         _memory_group.manage(&_input_permuted);
 
-        _permute_input.configure(input, &_input_permuted, get_permutation_vector_from_softmax_axis(actual_axis));
+        _permute_input.configure(input, &_input_permuted, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
     }
 
     // We want to deal with a 2D input. Either it is the permuted version of the original input (4D case)
@@ -87,7 +88,7 @@ void NESoftmaxLayerGeneric<IS_LOG>::configure(ITensor *input, ITensor *output, f
         _input_permuted.allocator()->allocate();
 
         // Re-permute the permuted output into the requested (4D) output
-        _permute_output.configure(&_output_permuted, output, get_permutation_vector_from_softmax_axis(actual_axis));
+        _permute_output.configure(&_output_permuted, output, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
 
         // Allocate the intermediate permuted tensors
         _output_permuted.allocator()->allocate();
@@ -128,7 +129,7 @@ Status NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const I
 
     if(needs_permute)
     {
-        const PermutationVector permutation_vector = get_permutation_vector_from_softmax_axis(actual_axis);
+        const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
         const TensorShape       permuted_shape     = misc::shape_calculator::compute_permutation_output_shape(*input, permutation_vector);
         TensorInfo              input_permuted(input->clone()->set_tensor_shape(permuted_shape));
         ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(input, &input_permuted, permutation_vector));
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index 1bad310640..23b9f60c38 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -23,17 +23,17 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
 #include "support/MemorySupport.h"
 
-#include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp"
+#include "src/core/NEON/kernels/convolution/common/utils.hpp"
 #include "src/core/NEON/kernels/convolution/winograd/winograd.hpp"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
index 73a7caac8b..11e89cb23b 100644
--- a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
@@ -24,18 +24,21 @@
 
 #include "arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h"
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/InfoHelpers.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h"
+#include "src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp"
+#include "src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp"
+#include "src/core/helpers/AutoConfiguration.h"
 
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
+#include "support/MemorySupport.h"
+
 #include <set>
 
 namespace arm_compute
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
index 4c2f03a53a..bf34b0114b 100644
--- a/src/runtime/OMP/OMPScheduler.cpp
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -27,7 +27,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/runtime/CPUUtils.h"
+#include "src/runtime/CPUUtils.h"
 #include <omp.h>
 
 namespace arm_compute
diff --git a/src/runtime/SchedulerUtils.cpp b/src/runtime/SchedulerUtils.cpp
index 1c12e3ce58..6f9a32c879 100644
--- a/src/runtime/SchedulerUtils.cpp
+++ b/src/runtime/SchedulerUtils.cpp
@@ -21,6 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#include "src/runtime/SchedulerUtils.h"
 
 #include "arm_compute/core/Error.h"
 
@@ -28,6 +29,8 @@
 
 namespace arm_compute
 {
+namespace scheduler_utils
+{
 #ifndef BARE_METAL
 std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std::size_t n)
 {
@@ -76,4 +79,5 @@ std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std:
     }
 }
 #endif /* #ifndef BARE_METAL */
+} // namespace scheduler_utils
 } // namespace arm_compute
diff --git a/src/runtime/SchedulerUtils.h b/src/runtime/SchedulerUtils.h
new file mode 100644
index 0000000000..46644a369e
--- /dev/null
+++ b/src/runtime/SchedulerUtils.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_COMPUTE_SCHEDULER_UTILS_H
+#define SRC_COMPUTE_SCHEDULER_UTILS_H
+
+#include <cstddef>
+#include <utility>
+
+namespace arm_compute
+{
+namespace scheduler_utils
+{
+/** Given two dimensions and a maximum number of threads to utilise, calculate the best
+ * combination of threads that fit in (multiplied together) max_threads.
+ *
+ * This algorithm assumes that work in either of the dimensions is equally difficult
+ * to compute
+ *
+ * @returns [m_nthreads, n_nthreads] A pair of the threads that should be used in each dimension
+ */
+std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std::size_t n);
+} // namespace scheduler_utils
+} // namespace arm_compute
+#endif /* SRC_COMPUTE_SCHEDULER_UTILS_H */
diff --git a/src/runtime/Utils.cpp b/src/runtime/Utils.cpp
index 534b421f8a..15e9d43a49 100644
--- a/src/runtime/Utils.cpp
+++ b/src/runtime/Utils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/Utils.h"
+#include "src/runtime/Utils.h"
 
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
@@ -31,6 +31,8 @@
 
 namespace arm_compute
 {
+namespace utils
+{
 #ifndef DOXYGEN_SKIP_THIS
 static const std::string information =
 #include "arm_compute_version.embed"
@@ -78,4 +80,5 @@ unsigned int calculate_number_of_stages_only_x_axis(size_t input_x_dimension, un
     const unsigned int num_of_stages = num_of_wg / 128 + 2;
     return num_of_stages;
 }
+} // namespace utils
 } // namespace arm_compute
diff --git a/src/runtime/Utils.h b/src/runtime/Utils.h
new file mode 100644
index 0000000000..f8775c9612
--- /dev/null
+++ b/src/runtime/Utils.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_UTILS_H
+#define SRC_RUNTIME_UTILS_H
+
+#include "arm_compute/runtime/IRuntimeContext.h"
+#include "arm_compute/runtime/Scheduler.h"
+
+#include <string>
+
+namespace arm_compute
+{
+namespace utils
+{
+/** Convert a Scheduler::Type into a string.
+ *
+ * @param[in] t @ref Scheduler::Type to be translated to string.
+ *
+ * @return The string describing the scheduler type.
+ */
+const std::string &string_from_scheduler_type(Scheduler::Type t);
+
+/** Schedules a kernel using the context if not nullptr else uses the legacy scheduling flow.
+ *
+ * @param[in] ctx    Context to use.
+ * @param[in] kernel Kernel to schedule.
+ * @param[in] hints  Hints to use.
+ */
+void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel, const IScheduler::Hints &hints);
+
+/** Calculate number of stages for parallel implementations
+ *
+ * @param[in] input_x_dimension input tensor x dimension
+ * @param[in] axis              axis to be used
+ */
+unsigned int calculate_number_of_stages_only_x_axis(size_t input_x_dimension, unsigned int axis);
+} // namespace utils
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_UTILS_H */
-- 
cgit v1.2.1