From 6ff3b19ee6120edf015fad8caab2991faa3070af Mon Sep 17 00:00:00 2001
From: Anthony Barbier
Date: Mon, 4 Sep 2017 18:44:23 +0100
Subject: COMPMID-344 Updated doxygen

Change-Id: I32f7b84daa560e460b77216add529c8fa8b327ae
---
 arm_compute/core/AccessWindowAutoPadding.h | 76 ++ arm_compute/core/AccessWindowStatic.h | 92 ++ arm_compute/core/AccessWindowTranspose.h | 48 + arm_compute/core/CL/CLHelpers.h | 105 ++ arm_compute/core/CL/CLKernelLibrary.h | 248 +++++ arm_compute/core/CL/CLKernels.h | 90 ++ arm_compute/core/CL/CLTypes.h | 41 + arm_compute/core/CL/ICLArray.h | 118 +++ arm_compute/core/CL/ICLDistribution1D.h | 102 ++ arm_compute/core/CL/ICLHOG.h | 113 +++ arm_compute/core/CL/ICLKernel.h | 157 +++ arm_compute/core/CL/ICLLut.h | 94 ++ arm_compute/core/CL/ICLMultiHOG.h | 56 ++ arm_compute/core/CL/ICLMultiImage.h | 58 ++ arm_compute/core/CL/ICLSimple2DKernel.h | 41 + arm_compute/core/CL/ICLSimple3DKernel.h | 43 + arm_compute/core/CL/ICLSimpleKernel.h | 66 ++ arm_compute/core/CL/ICLTensor.h | 106 ++ arm_compute/core/CL/OpenCL.h | 43 + .../core/CL/kernels/CLAbsoluteDifferenceKernel.h | 71 ++ arm_compute/core/CL/kernels/CLAccumulateKernel.h | 91 ++ .../core/CL/kernels/CLActivationLayerKernel.h | 46 + .../core/CL/kernels/CLArithmeticAdditionKernel.h | 72 ++ .../CL/kernels/CLArithmeticSubtractionKernel.h | 74 ++ .../CL/kernels/CLBatchNormalizationLayerKernel.h | 77 ++ arm_compute/core/CL/kernels/CLBitwiseAndKernel.h | 68 ++ arm_compute/core/CL/kernels/CLBitwiseNotKernel.h | 49 + arm_compute/core/CL/kernels/CLBitwiseOrKernel.h | 68 ++ arm_compute/core/CL/kernels/CLBitwiseXorKernel.h | 68 ++ arm_compute/core/CL/kernels/CLBox3x3Kernel.h | 51 + arm_compute/core/CL/kernels/CLCannyEdgeKernel.h | 147 +++ .../core/CL/kernels/CLChannelCombineKernel.h | 83 ++ .../core/CL/kernels/CLChannelExtractKernel.h | 79 ++ arm_compute/core/CL/kernels/CLCol2ImKernel.h | 86 ++ arm_compute/core/CL/kernels/CLColorConvertKernel.h | 90 ++ arm_compute/core/CL/kernels/CLConvolutionKernel.h | 182 ++++ .../core/CL/kernels/CLDepthConcatenateKernel.h | 76 ++ arm_compute/core/CL/kernels/CLDepthConvertKernel.h | 61 ++ arm_compute/core/CL/kernels/CLDerivativeKernel.h | 72 ++ arm_compute/core/CL/kernels/CLDilateKernel.h | 51 + arm_compute/core/CL/kernels/CLErodeKernel.h | 51 + arm_compute/core/CL/kernels/CLFastCornersKernel.h | 114 +++ arm_compute/core/CL/kernels/CLFillBorderKernel.h | 77 ++ .../core/CL/kernels/CLGEMMInterleave4x4Kernel.h | 80 ++ .../CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h | 81 ++ .../kernels/CLGEMMMatrixAccumulateBiasesKernel.h | 63 ++ .../core/CL/kernels/CLGEMMMatrixAdditionKernel.h | 70 ++ .../core/CL/kernels/CLGEMMMatrixMultiplyKernel.h | 73 ++ .../core/CL/kernels/CLGEMMTranspose1xWKernel.h | 84 ++ arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h | 51 + arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h | 67 ++ .../core/CL/kernels/CLGaussianPyramidKernel.h | 100 ++ .../core/CL/kernels/CLHOGDescriptorKernel.h | 105 ++ arm_compute/core/CL/kernels/CLHOGDetectorKernel.h | 82 ++ .../core/CL/kernels/CLHarrisCornersKernel.h | 85 ++ arm_compute/core/CL/kernels/CLHistogramKernel.h | 98 ++ arm_compute/core/CL/kernels/CLIm2ColKernel.h | 111 +++ .../core/CL/kernels/CLIntegralImageKernel.h | 73 ++ arm_compute/core/CL/kernels/CLLKTrackerKernel.h | 183 ++++ .../CLLocallyConnectedMatrixMultiplyKernel.h | 68 ++ .../core/CL/kernels/CLMagnitudePhaseKernel.h | 77 ++ arm_compute/core/CL/kernels/CLMeanStdDevKernel.h | 74 ++ arm_compute/core/CL/kernels/CLMedian3x3Kernel.h | 51 + .../core/CL/kernels/CLMinMaxLocationKernel.h | 104 ++
.../core/CL/kernels/CLNonLinearFilterKernel.h | 63 ++ .../CL/kernels/CLNonMaximaSuppression3x3Kernel.h | 52 + .../core/CL/kernels/CLNormalizationLayerKernel.h | 71 ++ .../CL/kernels/CLPixelWiseMultiplicationKernel.h | 73 ++ arm_compute/core/CL/kernels/CLPoolingLayerKernel.h | 69 ++ arm_compute/core/CL/kernels/CLRemapKernel.h | 70 ++ arm_compute/core/CL/kernels/CLScaleKernel.h | 55 ++ arm_compute/core/CL/kernels/CLScharr3x3Kernel.h | 86 ++ arm_compute/core/CL/kernels/CLSobel3x3Kernel.h | 72 ++ arm_compute/core/CL/kernels/CLSobel5x5Kernel.h | 116 +++ arm_compute/core/CL/kernels/CLSobel7x7Kernel.h | 116 +++ arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h | 109 +++ arm_compute/core/CL/kernels/CLTableLookupKernel.h | 47 + arm_compute/core/CL/kernels/CLThresholdKernel.h | 56 ++ arm_compute/core/CL/kernels/CLTransposeKernel.h | 49 + arm_compute/core/CL/kernels/CLWarpAffineKernel.h | 51 + .../core/CL/kernels/CLWarpPerspectiveKernel.h | 51 + .../core/CL/kernels/CLWeightsReshapeKernel.h | 114 +++ arm_compute/core/CPP/CPPKernels.h | 32 + arm_compute/core/CPP/ICPPKernel.h | 53 + arm_compute/core/CPP/ICPPSimpleKernel.h | 66 ++ .../core/CPP/kernels/CPPCornerCandidatesKernel.h | 74 ++ .../CPPDetectionWindowNonMaximaSuppressionKernel.h | 72 ++ .../CPP/kernels/CPPSortEuclideanDistanceKernel.h | 70 ++ arm_compute/core/Coordinates.h | 61 ++ arm_compute/core/Dimensions.h | 178 ++++ arm_compute/core/Error.h | 160 +++ arm_compute/core/FixedPoint.h | 217 +++++ arm_compute/core/FixedPoint.inl | 252 +++++ arm_compute/core/HOGInfo.h | 146 +++ arm_compute/core/Helpers.h | 507 ++++++++++ arm_compute/core/Helpers.inl | 306 ++++++ arm_compute/core/IAccessWindow.h | 225 +++++ arm_compute/core/IArray.h | 149 +++ arm_compute/core/IDistribution.h | 59 ++ arm_compute/core/IDistribution1D.h | 84 ++ arm_compute/core/IHOG.h | 54 ++ arm_compute/core/IKernel.h | 72 ++ arm_compute/core/ILut.h | 69 ++ arm_compute/core/IMultiHOG.h | 61 ++ arm_compute/core/IMultiImage.h | 60 ++ arm_compute/core/IPyramid.h | 56 ++ arm_compute/core/ITensor.h | 90 ++ arm_compute/core/ITensorInfo.h | 195 ++++ arm_compute/core/MultiImageInfo.h | 66 ++ arm_compute/core/NEON/INEKernel.h | 33 + arm_compute/core/NEON/INESimpleKernel.h | 33 + arm_compute/core/NEON/NEColorConvertHelper.inl | 888 +++++++++++++++++ arm_compute/core/NEON/NEFixedPoint.h | 686 +++++++++++++ arm_compute/core/NEON/NEFixedPoint.inl | 1018 ++++++++++++++++++++ arm_compute/core/NEON/NEKernels.h | 96 ++ arm_compute/core/NEON/NEMath.h | 96 ++ arm_compute/core/NEON/NEMath.inl | 141 +++ .../core/NEON/kernels/NEAbsoluteDifferenceKernel.h | 82 ++ arm_compute/core/NEON/kernels/NEAccumulateKernel.h | 122 +++ .../core/NEON/kernels/NEActivationLayerKernel.h | 84 ++ .../core/NEON/kernels/NEArithmeticAdditionKernel.h | 79 ++ .../NEON/kernels/NEArithmeticSubtractionKernel.h | 79 ++ .../NEON/kernels/NEBatchNormalizationLayerKernel.h | 78 ++ arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h | 68 ++ arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h | 66 ++ arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h | 68 ++ arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h | 68 ++ arm_compute/core/NEON/kernels/NEBox3x3Kernel.h | 62 ++ arm_compute/core/NEON/kernels/NECannyEdgeKernel.h | 190 ++++ .../core/NEON/kernels/NEChannelCombineKernel.h | 125 +++ .../core/NEON/kernels/NEChannelExtractKernel.h | 109 +++ arm_compute/core/NEON/kernels/NECol2ImKernel.h | 100 ++ .../core/NEON/kernels/NEColorConvertKernel.h | 88 ++ .../core/NEON/kernels/NEConvolutionKernel.h | 251 +++++ 
.../NEON/kernels/NECumulativeDistributionKernel.h | 80 ++ .../core/NEON/kernels/NEDepthConcatenateKernel.h | 76 ++ .../core/NEON/kernels/NEDepthConvertKernel.h | 68 ++ arm_compute/core/NEON/kernels/NEDerivativeKernel.h | 94 ++ arm_compute/core/NEON/kernels/NEDilateKernel.h | 49 + .../NEDirectConvolutionLayerBiasAccumulateKernel.h | 74 ++ .../NEON/kernels/NEDirectConvolutionLayerKernel.h | 76 ++ arm_compute/core/NEON/kernels/NEErodeKernel.h | 49 + .../core/NEON/kernels/NEFastCornersKernel.h | 72 ++ arm_compute/core/NEON/kernels/NEFillArrayKernel.h | 73 ++ arm_compute/core/NEON/kernels/NEFillBorderKernel.h | 79 ++ .../core/NEON/kernels/NEFillInnerBorderKernel.h | 75 ++ .../core/NEON/kernels/NEGEMMInterleave4x4Kernel.h | 79 ++ .../NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h | 88 ++ .../kernels/NEGEMMMatrixAccumulateBiasesKernel.h | 63 ++ .../core/NEON/kernels/NEGEMMMatrixAdditionKernel.h | 81 ++ .../core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h | 75 ++ .../core/NEON/kernels/NEGEMMTranspose1xWKernel.h | 82 ++ .../core/NEON/kernels/NEGaussian3x3Kernel.h | 50 + .../core/NEON/kernels/NEGaussian5x5Kernel.h | 73 ++ .../core/NEON/kernels/NEGaussianPyramidKernel.h | 100 ++ .../core/NEON/kernels/NEHOGDescriptorKernel.h | 141 +++ .../core/NEON/kernels/NEHOGDetectorKernel.h | 87 ++ .../core/NEON/kernels/NEHarrisCornersKernel.h | 126 +++ arm_compute/core/NEON/kernels/NEHistogramKernel.h | 129 +++ arm_compute/core/NEON/kernels/NEIm2ColKernel.h | 114 +++ .../core/NEON/kernels/NEIntegralImageKernel.h | 50 + arm_compute/core/NEON/kernels/NELKTrackerKernel.h | 144 +++ .../NELocallyConnectedMatrixMultiplyKernel.h | 64 ++ .../core/NEON/kernels/NEMagnitudePhaseKernel.h | 164 ++++ arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h | 76 ++ arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h | 50 + .../core/NEON/kernels/NEMinMaxLocationKernel.h | 161 ++++ .../core/NEON/kernels/NENonLinearFilterKernel.h | 147 +++ .../NEON/kernels/NENonMaximaSuppression3x3Kernel.h | 99 ++ .../core/NEON/kernels/NENormalizationLayerKernel.h | 106 ++ .../NEON/kernels/NEPixelWiseMultiplicationKernel.h | 105 ++ .../core/NEON/kernels/NEPoolingLayerKernel.h | 106 ++ arm_compute/core/NEON/kernels/NERemapKernel.h | 78 ++ arm_compute/core/NEON/kernels/NEScaleKernel.h | 89 ++ arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h | 82 ++ arm_compute/core/NEON/kernels/NESobel3x3Kernel.h | 82 ++ arm_compute/core/NEON/kernels/NESobel5x5Kernel.h | 118 +++ arm_compute/core/NEON/kernels/NESobel7x7Kernel.h | 122 +++ .../core/NEON/kernels/NESoftmaxLayerKernel.h | 135 +++ .../core/NEON/kernels/NETableLookupKernel.h | 76 ++ arm_compute/core/NEON/kernels/NEThresholdKernel.h | 81 ++ arm_compute/core/NEON/kernels/NETransposeKernel.h | 78 ++ arm_compute/core/NEON/kernels/NEWarpKernel.h | 117 +++ .../core/NEON/kernels/NEWeightsReshapeKernel.h | 94 ++ arm_compute/core/PixelValue.h | 168 ++++ arm_compute/core/PyramidInfo.h | 131 +++ arm_compute/core/Size2D.h | 84 ++ arm_compute/core/Steps.h | 66 ++ arm_compute/core/Strides.h | 62 ++ arm_compute/core/SubTensorInfo.h | 184 ++++ arm_compute/core/TensorInfo.h | 300 ++++++ arm_compute/core/TensorShape.h | 141 +++ arm_compute/core/Types.h | 636 ++++++++++++ arm_compute/core/Utils.h | 740 ++++++++++++++ arm_compute/core/Validate.h | 563 +++++++++++ arm_compute/core/Window.h | 355 +++++++ arm_compute/core/Window.inl | 182 ++++ arm_compute/runtime/Array.h | 75 ++ arm_compute/runtime/CL/CLArray.h | 108 +++ arm_compute/runtime/CL/CLDistribution1D.h | 79 ++ arm_compute/runtime/CL/CLFunctions.h | 94 ++ 
arm_compute/runtime/CL/CLHOG.h | 80 ++ arm_compute/runtime/CL/CLLut.h | 89 ++ arm_compute/runtime/CL/CLLutAllocator.h | 88 ++ arm_compute/runtime/CL/CLMultiHOG.h | 56 ++ arm_compute/runtime/CL/CLMultiImage.h | 87 ++ arm_compute/runtime/CL/CLPyramid.h | 82 ++ arm_compute/runtime/CL/CLScheduler.h | 158 +++ arm_compute/runtime/CL/CLSubTensor.h | 99 ++ arm_compute/runtime/CL/CLTensor.h | 81 ++ arm_compute/runtime/CL/CLTensorAllocator.h | 103 ++ arm_compute/runtime/CL/ICLSimpleFunction.h | 50 + .../runtime/CL/functions/CLAbsoluteDifference.h | 50 + arm_compute/runtime/CL/functions/CLAccumulate.h | 73 ++ .../runtime/CL/functions/CLActivationLayer.h | 51 + .../runtime/CL/functions/CLArithmeticAddition.h | 52 + .../runtime/CL/functions/CLArithmeticSubtraction.h | 53 + .../CL/functions/CLBatchNormalizationLayer.h | 67 ++ arm_compute/runtime/CL/functions/CLBitwiseAnd.h | 50 + arm_compute/runtime/CL/functions/CLBitwiseNot.h | 49 + arm_compute/runtime/CL/functions/CLBitwiseOr.h | 50 + arm_compute/runtime/CL/functions/CLBitwiseXor.h | 50 + arm_compute/runtime/CL/functions/CLBox3x3.h | 55 ++ arm_compute/runtime/CL/functions/CLCannyEdge.h | 85 ++ .../runtime/CL/functions/CLChannelCombine.h | 58 ++ .../runtime/CL/functions/CLChannelExtract.h | 56 ++ arm_compute/runtime/CL/functions/CLColorConvert.h | 68 ++ arm_compute/runtime/CL/functions/CLConvolution.h | 128 +++ .../runtime/CL/functions/CLConvolutionLayer.h | 121 +++ .../runtime/CL/functions/CLDepthConcatenate.h | 69 ++ arm_compute/runtime/CL/functions/CLDepthConvert.h | 60 ++ arm_compute/runtime/CL/functions/CLDerivative.h | 59 ++ arm_compute/runtime/CL/functions/CLDilate.h | 55 ++ .../runtime/CL/functions/CLEqualizeHistogram.h | 72 ++ arm_compute/runtime/CL/functions/CLErode.h | 55 ++ arm_compute/runtime/CL/functions/CLFastCorners.h | 88 ++ arm_compute/runtime/CL/functions/CLFillBorder.h | 49 + .../runtime/CL/functions/CLFullyConnectedLayer.h | 120 +++ arm_compute/runtime/CL/functions/CLGEMM.h | 84 ++ .../runtime/CL/functions/CLGEMMInterleave4x4.h | 50 + arm_compute/runtime/CL/functions/CLGEMMLowp.h | 85 ++ arm_compute/runtime/CL/functions/CLGaussian3x3.h | 55 ++ arm_compute/runtime/CL/functions/CLGaussian5x5.h | 70 ++ .../runtime/CL/functions/CLGaussianPyramid.h | 119 +++ arm_compute/runtime/CL/functions/CLHOGDescriptor.h | 72 ++ arm_compute/runtime/CL/functions/CLHOGDetector.h | 78 ++ arm_compute/runtime/CL/functions/CLHOGGradient.h | 72 ++ .../runtime/CL/functions/CLHOGMultiDetection.h | 105 ++ arm_compute/runtime/CL/functions/CLHarrisCorners.h | 104 ++ arm_compute/runtime/CL/functions/CLHistogram.h | 68 ++ arm_compute/runtime/CL/functions/CLIntegralImage.h | 60 ++ .../runtime/CL/functions/CLLaplacianPyramid.h | 85 ++ .../runtime/CL/functions/CLLaplacianReconstruct.h | 91 ++ .../runtime/CL/functions/CLLocallyConnectedLayer.h | 79 ++ arm_compute/runtime/CL/functions/CLMagnitude.h | 48 + arm_compute/runtime/CL/functions/CLMeanStdDev.h | 56 ++ arm_compute/runtime/CL/functions/CLMedian3x3.h | 55 ++ .../runtime/CL/functions/CLMinMaxLocation.h | 86 ++ .../runtime/CL/functions/CLNonLinearFilter.h | 61 ++ .../CL/functions/CLNonMaximaSuppression3x3.h | 55 ++ .../runtime/CL/functions/CLNormalizationLayer.h | 71 ++ arm_compute/runtime/CL/functions/CLOpticalFlow.h | 111 +++ arm_compute/runtime/CL/functions/CLPhase.h | 48 + .../CL/functions/CLPixelWiseMultiplication.h | 51 + arm_compute/runtime/CL/functions/CLPoolingLayer.h | 52 + arm_compute/runtime/CL/functions/CLRemap.h | 59 ++ arm_compute/runtime/CL/functions/CLScale.h | 52 + 
arm_compute/runtime/CL/functions/CLScharr3x3.h | 58 ++ arm_compute/runtime/CL/functions/CLSobel3x3.h | 58 ++ arm_compute/runtime/CL/functions/CLSobel5x5.h | 74 ++ arm_compute/runtime/CL/functions/CLSobel7x7.h | 74 ++ arm_compute/runtime/CL/functions/CLSoftmaxLayer.h | 69 ++ arm_compute/runtime/CL/functions/CLTableLookup.h | 47 + arm_compute/runtime/CL/functions/CLThreshold.h | 55 ++ arm_compute/runtime/CL/functions/CLTranspose.h | 50 + arm_compute/runtime/CL/functions/CLWarpAffine.h | 52 + .../runtime/CL/functions/CLWarpPerspective.h | 52 + arm_compute/runtime/CPP/CPPScheduler.h | 73 ++ arm_compute/runtime/Distribution1D.h | 55 ++ arm_compute/runtime/HOG.h | 56 ++ arm_compute/runtime/IFunction.h | 54 ++ arm_compute/runtime/ILutAllocator.h | 84 ++ arm_compute/runtime/IScheduler.h | 55 ++ arm_compute/runtime/ITensorAllocator.h | 93 ++ arm_compute/runtime/Lut.h | 68 ++ arm_compute/runtime/LutAllocator.h | 58 ++ arm_compute/runtime/MultiHOG.h | 58 ++ arm_compute/runtime/MultiImage.h | 96 ++ arm_compute/runtime/NEON/INESimpleFunction.h | 50 + arm_compute/runtime/NEON/NEFunctions.h | 96 ++ arm_compute/runtime/NEON/NEScheduler.h | 33 + .../runtime/NEON/functions/NEAbsoluteDifference.h | 50 + arm_compute/runtime/NEON/functions/NEAccumulate.h | 74 ++ .../runtime/NEON/functions/NEActivationLayer.h | 51 + .../runtime/NEON/functions/NEArithmeticAddition.h | 48 + .../NEON/functions/NEArithmeticSubtraction.h | 48 + .../NEON/functions/NEBatchNormalizationLayer.h | 66 ++ arm_compute/runtime/NEON/functions/NEBitwiseAnd.h | 46 + arm_compute/runtime/NEON/functions/NEBitwiseNot.h | 45 + arm_compute/runtime/NEON/functions/NEBitwiseOr.h | 46 + arm_compute/runtime/NEON/functions/NEBitwiseXor.h | 46 + arm_compute/runtime/NEON/functions/NEBox3x3.h | 58 ++ arm_compute/runtime/NEON/functions/NECannyEdge.h | 97 ++ .../runtime/NEON/functions/NEChannelCombine.h | 58 ++ .../runtime/NEON/functions/NEChannelExtract.h | 56 ++ .../runtime/NEON/functions/NEColorConvert.h | 65 ++ arm_compute/runtime/NEON/functions/NEConvolution.h | 128 +++ .../runtime/NEON/functions/NEConvolutionLayer.h | 115 +++ .../runtime/NEON/functions/NEDepthConcatenate.h | 66 ++ .../runtime/NEON/functions/NEDepthConvert.h | 67 ++ arm_compute/runtime/NEON/functions/NEDerivative.h | 70 ++ arm_compute/runtime/NEON/functions/NEDilate.h | 55 ++ .../NEON/functions/NEDirectConvolutionLayer.h | 72 ++ .../runtime/NEON/functions/NEEqualizeHistogram.h | 77 ++ arm_compute/runtime/NEON/functions/NEErode.h | 55 ++ arm_compute/runtime/NEON/functions/NEFastCorners.h | 80 ++ arm_compute/runtime/NEON/functions/NEFillBorder.h | 58 ++ .../runtime/NEON/functions/NEFullyConnectedLayer.h | 119 +++ arm_compute/runtime/NEON/functions/NEGEMM.h | 78 ++ .../runtime/NEON/functions/NEGEMMInterleave4x4.h | 49 + arm_compute/runtime/NEON/functions/NEGEMMLowp.h | 85 ++ .../runtime/NEON/functions/NEGEMMTranspose1xW.h | 47 + arm_compute/runtime/NEON/functions/NEGaussian3x3.h | 55 ++ arm_compute/runtime/NEON/functions/NEGaussian5x5.h | 71 ++ .../runtime/NEON/functions/NEGaussianPyramid.h | 122 +++ .../runtime/NEON/functions/NEHOGDescriptor.h | 71 ++ arm_compute/runtime/NEON/functions/NEHOGDetector.h | 57 ++ arm_compute/runtime/NEON/functions/NEHOGGradient.h | 72 ++ .../runtime/NEON/functions/NEHOGMultiDetection.h | 105 ++ .../runtime/NEON/functions/NEHarrisCorners.h | 103 ++ arm_compute/runtime/NEON/functions/NEHistogram.h | 63 ++ .../runtime/NEON/functions/NEIntegralImage.h | 45 + .../runtime/NEON/functions/NELaplacianPyramid.h | 85 ++ .../NEON/functions/NELaplacianReconstruct.h | 91 ++ 
.../NEON/functions/NELocallyConnectedLayer.h | 79 ++ arm_compute/runtime/NEON/functions/NEMagnitude.h | 47 + arm_compute/runtime/NEON/functions/NEMeanStdDev.h | 62 ++ arm_compute/runtime/NEON/functions/NEMedian3x3.h | 56 ++ .../runtime/NEON/functions/NEMinMaxLocation.h | 71 ++ .../runtime/NEON/functions/NENonLinearFilter.h | 61 ++ .../NEON/functions/NENonMaximaSuppression3x3.h | 56 ++ .../runtime/NEON/functions/NENormalizationLayer.h | 71 ++ arm_compute/runtime/NEON/functions/NEOpticalFlow.h | 95 ++ arm_compute/runtime/NEON/functions/NEPhase.h | 46 + .../NEON/functions/NEPixelWiseMultiplication.h | 50 + .../runtime/NEON/functions/NEPoolingLayer.h | 52 + arm_compute/runtime/NEON/functions/NERemap.h | 60 ++ arm_compute/runtime/NEON/functions/NEScale.h | 62 ++ arm_compute/runtime/NEON/functions/NEScharr3x3.h | 59 ++ arm_compute/runtime/NEON/functions/NESobel3x3.h | 59 ++ arm_compute/runtime/NEON/functions/NESobel5x5.h | 75 ++ arm_compute/runtime/NEON/functions/NESobel7x7.h | 75 ++ .../runtime/NEON/functions/NESoftmaxLayer.h | 71 ++ arm_compute/runtime/NEON/functions/NETableLookup.h | 47 + arm_compute/runtime/NEON/functions/NEThreshold.h | 54 ++ arm_compute/runtime/NEON/functions/NETranspose.h | 51 + arm_compute/runtime/NEON/functions/NEWarpAffine.h | 52 + .../runtime/NEON/functions/NEWarpPerspective.h | 52 + arm_compute/runtime/OMP/OMPScheduler.h | 68 ++ arm_compute/runtime/Pyramid.h | 76 ++ arm_compute/runtime/Scheduler.h | 77 ++ arm_compute/runtime/SingleThreadScheduler.h | 62 ++ arm_compute/runtime/SubTensor.h | 73 ++ arm_compute/runtime/Tensor.h | 65 ++ arm_compute/runtime/TensorAllocator.h | 90 ++ arm_compute/runtime/Utils.h | 41 + 366 files changed, 34736 insertions(+) create mode 100644 arm_compute/core/AccessWindowAutoPadding.h create mode 100644 arm_compute/core/AccessWindowStatic.h create mode 100644 arm_compute/core/AccessWindowTranspose.h create mode 100644 arm_compute/core/CL/CLHelpers.h create mode 100644 arm_compute/core/CL/CLKernelLibrary.h create mode 100644 arm_compute/core/CL/CLKernels.h create mode 100644 arm_compute/core/CL/CLTypes.h create mode 100644 arm_compute/core/CL/ICLArray.h create mode 100644 arm_compute/core/CL/ICLDistribution1D.h create mode 100644 arm_compute/core/CL/ICLHOG.h create mode 100644 arm_compute/core/CL/ICLKernel.h create mode 100644 arm_compute/core/CL/ICLLut.h create mode 100644 arm_compute/core/CL/ICLMultiHOG.h create mode 100644 arm_compute/core/CL/ICLMultiImage.h create mode 100644 arm_compute/core/CL/ICLSimple2DKernel.h create mode 100644 arm_compute/core/CL/ICLSimple3DKernel.h create mode 100644 arm_compute/core/CL/ICLSimpleKernel.h create mode 100644 arm_compute/core/CL/ICLTensor.h create mode 100644 arm_compute/core/CL/OpenCL.h create mode 100644 arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h create mode 100644 arm_compute/core/CL/kernels/CLAccumulateKernel.h create mode 100644 arm_compute/core/CL/kernels/CLActivationLayerKernel.h create mode 100644 arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h create mode 100644 arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h create mode 100644 arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h create mode 100644 arm_compute/core/CL/kernels/CLBitwiseAndKernel.h create mode 100644 arm_compute/core/CL/kernels/CLBitwiseNotKernel.h create mode 100644 arm_compute/core/CL/kernels/CLBitwiseOrKernel.h create mode 100644 arm_compute/core/CL/kernels/CLBitwiseXorKernel.h create mode 100644 arm_compute/core/CL/kernels/CLBox3x3Kernel.h create mode 100644 
arm_compute/core/CL/kernels/CLCannyEdgeKernel.h create mode 100644 arm_compute/core/CL/kernels/CLChannelCombineKernel.h create mode 100644 arm_compute/core/CL/kernels/CLChannelExtractKernel.h create mode 100644 arm_compute/core/CL/kernels/CLCol2ImKernel.h create mode 100644 arm_compute/core/CL/kernels/CLColorConvertKernel.h create mode 100644 arm_compute/core/CL/kernels/CLConvolutionKernel.h create mode 100644 arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h create mode 100644 arm_compute/core/CL/kernels/CLDepthConvertKernel.h create mode 100644 arm_compute/core/CL/kernels/CLDerivativeKernel.h create mode 100644 arm_compute/core/CL/kernels/CLDilateKernel.h create mode 100644 arm_compute/core/CL/kernels/CLErodeKernel.h create mode 100644 arm_compute/core/CL/kernels/CLFastCornersKernel.h create mode 100644 arm_compute/core/CL/kernels/CLFillBorderKernel.h create mode 100644 arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h create mode 100644 arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h create mode 100644 arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h create mode 100644 arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h create mode 100644 arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h create mode 100644 arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h create mode 100644 arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h create mode 100644 arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h create mode 100644 arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h create mode 100644 arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h create mode 100644 arm_compute/core/CL/kernels/CLHOGDetectorKernel.h create mode 100644 arm_compute/core/CL/kernels/CLHarrisCornersKernel.h create mode 100644 arm_compute/core/CL/kernels/CLHistogramKernel.h create mode 100644 arm_compute/core/CL/kernels/CLIm2ColKernel.h create mode 100644 arm_compute/core/CL/kernels/CLIntegralImageKernel.h create mode 100644 arm_compute/core/CL/kernels/CLLKTrackerKernel.h create mode 100644 arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h create mode 100644 arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h create mode 100644 arm_compute/core/CL/kernels/CLMeanStdDevKernel.h create mode 100644 arm_compute/core/CL/kernels/CLMedian3x3Kernel.h create mode 100644 arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h create mode 100644 arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h create mode 100644 arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h create mode 100644 arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h create mode 100644 arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h create mode 100644 arm_compute/core/CL/kernels/CLPoolingLayerKernel.h create mode 100644 arm_compute/core/CL/kernels/CLRemapKernel.h create mode 100644 arm_compute/core/CL/kernels/CLScaleKernel.h create mode 100644 arm_compute/core/CL/kernels/CLScharr3x3Kernel.h create mode 100644 arm_compute/core/CL/kernels/CLSobel3x3Kernel.h create mode 100644 arm_compute/core/CL/kernels/CLSobel5x5Kernel.h create mode 100644 arm_compute/core/CL/kernels/CLSobel7x7Kernel.h create mode 100644 arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h create mode 100644 arm_compute/core/CL/kernels/CLTableLookupKernel.h create mode 100644 arm_compute/core/CL/kernels/CLThresholdKernel.h create mode 100644 arm_compute/core/CL/kernels/CLTransposeKernel.h create mode 100644 arm_compute/core/CL/kernels/CLWarpAffineKernel.h create mode 100644 
arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h create mode 100644 arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h create mode 100644 arm_compute/core/CPP/CPPKernels.h create mode 100644 arm_compute/core/CPP/ICPPKernel.h create mode 100644 arm_compute/core/CPP/ICPPSimpleKernel.h create mode 100644 arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h create mode 100644 arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h create mode 100644 arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h create mode 100644 arm_compute/core/Coordinates.h create mode 100644 arm_compute/core/Dimensions.h create mode 100644 arm_compute/core/Error.h create mode 100644 arm_compute/core/FixedPoint.h create mode 100644 arm_compute/core/FixedPoint.inl create mode 100644 arm_compute/core/HOGInfo.h create mode 100644 arm_compute/core/Helpers.h create mode 100644 arm_compute/core/Helpers.inl create mode 100644 arm_compute/core/IAccessWindow.h create mode 100644 arm_compute/core/IArray.h create mode 100644 arm_compute/core/IDistribution.h create mode 100644 arm_compute/core/IDistribution1D.h create mode 100644 arm_compute/core/IHOG.h create mode 100644 arm_compute/core/IKernel.h create mode 100644 arm_compute/core/ILut.h create mode 100644 arm_compute/core/IMultiHOG.h create mode 100644 arm_compute/core/IMultiImage.h create mode 100644 arm_compute/core/IPyramid.h create mode 100644 arm_compute/core/ITensor.h create mode 100644 arm_compute/core/ITensorInfo.h create mode 100644 arm_compute/core/MultiImageInfo.h create mode 100644 arm_compute/core/NEON/INEKernel.h create mode 100644 arm_compute/core/NEON/INESimpleKernel.h create mode 100644 arm_compute/core/NEON/NEColorConvertHelper.inl create mode 100644 arm_compute/core/NEON/NEFixedPoint.h create mode 100644 arm_compute/core/NEON/NEFixedPoint.inl create mode 100644 arm_compute/core/NEON/NEKernels.h create mode 100644 arm_compute/core/NEON/NEMath.h create mode 100644 arm_compute/core/NEON/NEMath.inl create mode 100644 arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEAccumulateKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEActivationLayerKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEBox3x3Kernel.h create mode 100644 arm_compute/core/NEON/kernels/NECannyEdgeKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEChannelCombineKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEChannelExtractKernel.h create mode 100644 arm_compute/core/NEON/kernels/NECol2ImKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEColorConvertKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEConvolutionKernel.h create mode 100644 arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEDepthConvertKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEDerivativeKernel.h create mode 
100644 arm_compute/core/NEON/kernels/NEDilateKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEErodeKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEFastCornersKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEFillArrayKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEFillBorderKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h create mode 100644 arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h create mode 100644 arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h create mode 100644 arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEHistogramKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEIm2ColKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEIntegralImageKernel.h create mode 100644 arm_compute/core/NEON/kernels/NELKTrackerKernel.h create mode 100644 arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h create mode 100644 arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h create mode 100644 arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h create mode 100644 arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h create mode 100644 arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h create mode 100644 arm_compute/core/NEON/kernels/NERemapKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEScaleKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h create mode 100644 arm_compute/core/NEON/kernels/NESobel3x3Kernel.h create mode 100644 arm_compute/core/NEON/kernels/NESobel5x5Kernel.h create mode 100644 arm_compute/core/NEON/kernels/NESobel7x7Kernel.h create mode 100644 arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h create mode 100644 arm_compute/core/NEON/kernels/NETableLookupKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEThresholdKernel.h create mode 100644 arm_compute/core/NEON/kernels/NETransposeKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEWarpKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h create mode 100644 arm_compute/core/PixelValue.h create mode 100644 arm_compute/core/PyramidInfo.h create mode 100644 arm_compute/core/Size2D.h create mode 100644 arm_compute/core/Steps.h create 
mode 100644 arm_compute/core/Strides.h create mode 100644 arm_compute/core/SubTensorInfo.h create mode 100644 arm_compute/core/TensorInfo.h create mode 100644 arm_compute/core/TensorShape.h create mode 100644 arm_compute/core/Types.h create mode 100644 arm_compute/core/Utils.h create mode 100644 arm_compute/core/Validate.h create mode 100644 arm_compute/core/Window.h create mode 100644 arm_compute/core/Window.inl create mode 100644 arm_compute/runtime/Array.h create mode 100644 arm_compute/runtime/CL/CLArray.h create mode 100644 arm_compute/runtime/CL/CLDistribution1D.h create mode 100644 arm_compute/runtime/CL/CLFunctions.h create mode 100644 arm_compute/runtime/CL/CLHOG.h create mode 100644 arm_compute/runtime/CL/CLLut.h create mode 100644 arm_compute/runtime/CL/CLLutAllocator.h create mode 100644 arm_compute/runtime/CL/CLMultiHOG.h create mode 100644 arm_compute/runtime/CL/CLMultiImage.h create mode 100644 arm_compute/runtime/CL/CLPyramid.h create mode 100644 arm_compute/runtime/CL/CLScheduler.h create mode 100644 arm_compute/runtime/CL/CLSubTensor.h create mode 100644 arm_compute/runtime/CL/CLTensor.h create mode 100644 arm_compute/runtime/CL/CLTensorAllocator.h create mode 100644 arm_compute/runtime/CL/ICLSimpleFunction.h create mode 100644 arm_compute/runtime/CL/functions/CLAbsoluteDifference.h create mode 100644 arm_compute/runtime/CL/functions/CLAccumulate.h create mode 100644 arm_compute/runtime/CL/functions/CLActivationLayer.h create mode 100644 arm_compute/runtime/CL/functions/CLArithmeticAddition.h create mode 100644 arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h create mode 100644 arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h create mode 100644 arm_compute/runtime/CL/functions/CLBitwiseAnd.h create mode 100644 arm_compute/runtime/CL/functions/CLBitwiseNot.h create mode 100644 arm_compute/runtime/CL/functions/CLBitwiseOr.h create mode 100644 arm_compute/runtime/CL/functions/CLBitwiseXor.h create mode 100644 arm_compute/runtime/CL/functions/CLBox3x3.h create mode 100644 arm_compute/runtime/CL/functions/CLCannyEdge.h create mode 100644 arm_compute/runtime/CL/functions/CLChannelCombine.h create mode 100644 arm_compute/runtime/CL/functions/CLChannelExtract.h create mode 100644 arm_compute/runtime/CL/functions/CLColorConvert.h create mode 100644 arm_compute/runtime/CL/functions/CLConvolution.h create mode 100644 arm_compute/runtime/CL/functions/CLConvolutionLayer.h create mode 100644 arm_compute/runtime/CL/functions/CLDepthConcatenate.h create mode 100644 arm_compute/runtime/CL/functions/CLDepthConvert.h create mode 100644 arm_compute/runtime/CL/functions/CLDerivative.h create mode 100644 arm_compute/runtime/CL/functions/CLDilate.h create mode 100644 arm_compute/runtime/CL/functions/CLEqualizeHistogram.h create mode 100644 arm_compute/runtime/CL/functions/CLErode.h create mode 100644 arm_compute/runtime/CL/functions/CLFastCorners.h create mode 100644 arm_compute/runtime/CL/functions/CLFillBorder.h create mode 100644 arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h create mode 100644 arm_compute/runtime/CL/functions/CLGEMM.h create mode 100644 arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h create mode 100644 arm_compute/runtime/CL/functions/CLGEMMLowp.h create mode 100644 arm_compute/runtime/CL/functions/CLGaussian3x3.h create mode 100644 arm_compute/runtime/CL/functions/CLGaussian5x5.h create mode 100644 arm_compute/runtime/CL/functions/CLGaussianPyramid.h create mode 100644 arm_compute/runtime/CL/functions/CLHOGDescriptor.h create mode 
100644 arm_compute/runtime/CL/functions/CLHOGDetector.h create mode 100644 arm_compute/runtime/CL/functions/CLHOGGradient.h create mode 100644 arm_compute/runtime/CL/functions/CLHOGMultiDetection.h create mode 100644 arm_compute/runtime/CL/functions/CLHarrisCorners.h create mode 100644 arm_compute/runtime/CL/functions/CLHistogram.h create mode 100644 arm_compute/runtime/CL/functions/CLIntegralImage.h create mode 100644 arm_compute/runtime/CL/functions/CLLaplacianPyramid.h create mode 100644 arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h create mode 100644 arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h create mode 100644 arm_compute/runtime/CL/functions/CLMagnitude.h create mode 100644 arm_compute/runtime/CL/functions/CLMeanStdDev.h create mode 100644 arm_compute/runtime/CL/functions/CLMedian3x3.h create mode 100644 arm_compute/runtime/CL/functions/CLMinMaxLocation.h create mode 100644 arm_compute/runtime/CL/functions/CLNonLinearFilter.h create mode 100644 arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h create mode 100644 arm_compute/runtime/CL/functions/CLNormalizationLayer.h create mode 100644 arm_compute/runtime/CL/functions/CLOpticalFlow.h create mode 100644 arm_compute/runtime/CL/functions/CLPhase.h create mode 100644 arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h create mode 100644 arm_compute/runtime/CL/functions/CLPoolingLayer.h create mode 100644 arm_compute/runtime/CL/functions/CLRemap.h create mode 100644 arm_compute/runtime/CL/functions/CLScale.h create mode 100644 arm_compute/runtime/CL/functions/CLScharr3x3.h create mode 100644 arm_compute/runtime/CL/functions/CLSobel3x3.h create mode 100644 arm_compute/runtime/CL/functions/CLSobel5x5.h create mode 100644 arm_compute/runtime/CL/functions/CLSobel7x7.h create mode 100644 arm_compute/runtime/CL/functions/CLSoftmaxLayer.h create mode 100644 arm_compute/runtime/CL/functions/CLTableLookup.h create mode 100644 arm_compute/runtime/CL/functions/CLThreshold.h create mode 100644 arm_compute/runtime/CL/functions/CLTranspose.h create mode 100644 arm_compute/runtime/CL/functions/CLWarpAffine.h create mode 100644 arm_compute/runtime/CL/functions/CLWarpPerspective.h create mode 100644 arm_compute/runtime/CPP/CPPScheduler.h create mode 100644 arm_compute/runtime/Distribution1D.h create mode 100644 arm_compute/runtime/HOG.h create mode 100644 arm_compute/runtime/IFunction.h create mode 100644 arm_compute/runtime/ILutAllocator.h create mode 100644 arm_compute/runtime/IScheduler.h create mode 100644 arm_compute/runtime/ITensorAllocator.h create mode 100644 arm_compute/runtime/Lut.h create mode 100644 arm_compute/runtime/LutAllocator.h create mode 100644 arm_compute/runtime/MultiHOG.h create mode 100644 arm_compute/runtime/MultiImage.h create mode 100644 arm_compute/runtime/NEON/INESimpleFunction.h create mode 100644 arm_compute/runtime/NEON/NEFunctions.h create mode 100644 arm_compute/runtime/NEON/NEScheduler.h create mode 100644 arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h create mode 100644 arm_compute/runtime/NEON/functions/NEAccumulate.h create mode 100644 arm_compute/runtime/NEON/functions/NEActivationLayer.h create mode 100644 arm_compute/runtime/NEON/functions/NEArithmeticAddition.h create mode 100644 arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h create mode 100644 arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h create mode 100644 arm_compute/runtime/NEON/functions/NEBitwiseAnd.h create mode 100644 arm_compute/runtime/NEON/functions/NEBitwiseNot.h 
create mode 100644 arm_compute/runtime/NEON/functions/NEBitwiseOr.h create mode 100644 arm_compute/runtime/NEON/functions/NEBitwiseXor.h create mode 100644 arm_compute/runtime/NEON/functions/NEBox3x3.h create mode 100644 arm_compute/runtime/NEON/functions/NECannyEdge.h create mode 100644 arm_compute/runtime/NEON/functions/NEChannelCombine.h create mode 100644 arm_compute/runtime/NEON/functions/NEChannelExtract.h create mode 100644 arm_compute/runtime/NEON/functions/NEColorConvert.h create mode 100644 arm_compute/runtime/NEON/functions/NEConvolution.h create mode 100644 arm_compute/runtime/NEON/functions/NEConvolutionLayer.h create mode 100644 arm_compute/runtime/NEON/functions/NEDepthConcatenate.h create mode 100644 arm_compute/runtime/NEON/functions/NEDepthConvert.h create mode 100644 arm_compute/runtime/NEON/functions/NEDerivative.h create mode 100644 arm_compute/runtime/NEON/functions/NEDilate.h create mode 100644 arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h create mode 100644 arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h create mode 100644 arm_compute/runtime/NEON/functions/NEErode.h create mode 100644 arm_compute/runtime/NEON/functions/NEFastCorners.h create mode 100644 arm_compute/runtime/NEON/functions/NEFillBorder.h create mode 100644 arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h create mode 100644 arm_compute/runtime/NEON/functions/NEGEMM.h create mode 100644 arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h create mode 100644 arm_compute/runtime/NEON/functions/NEGEMMLowp.h create mode 100644 arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h create mode 100644 arm_compute/runtime/NEON/functions/NEGaussian3x3.h create mode 100644 arm_compute/runtime/NEON/functions/NEGaussian5x5.h create mode 100644 arm_compute/runtime/NEON/functions/NEGaussianPyramid.h create mode 100644 arm_compute/runtime/NEON/functions/NEHOGDescriptor.h create mode 100644 arm_compute/runtime/NEON/functions/NEHOGDetector.h create mode 100644 arm_compute/runtime/NEON/functions/NEHOGGradient.h create mode 100644 arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h create mode 100644 arm_compute/runtime/NEON/functions/NEHarrisCorners.h create mode 100644 arm_compute/runtime/NEON/functions/NEHistogram.h create mode 100644 arm_compute/runtime/NEON/functions/NEIntegralImage.h create mode 100644 arm_compute/runtime/NEON/functions/NELaplacianPyramid.h create mode 100644 arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h create mode 100644 arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h create mode 100644 arm_compute/runtime/NEON/functions/NEMagnitude.h create mode 100644 arm_compute/runtime/NEON/functions/NEMeanStdDev.h create mode 100644 arm_compute/runtime/NEON/functions/NEMedian3x3.h create mode 100644 arm_compute/runtime/NEON/functions/NEMinMaxLocation.h create mode 100644 arm_compute/runtime/NEON/functions/NENonLinearFilter.h create mode 100644 arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h create mode 100644 arm_compute/runtime/NEON/functions/NENormalizationLayer.h create mode 100644 arm_compute/runtime/NEON/functions/NEOpticalFlow.h create mode 100644 arm_compute/runtime/NEON/functions/NEPhase.h create mode 100644 arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h create mode 100644 arm_compute/runtime/NEON/functions/NEPoolingLayer.h create mode 100644 arm_compute/runtime/NEON/functions/NERemap.h create mode 100644 arm_compute/runtime/NEON/functions/NEScale.h create mode 100644 
arm_compute/runtime/NEON/functions/NEScharr3x3.h create mode 100644 arm_compute/runtime/NEON/functions/NESobel3x3.h create mode 100644 arm_compute/runtime/NEON/functions/NESobel5x5.h create mode 100644 arm_compute/runtime/NEON/functions/NESobel7x7.h create mode 100644 arm_compute/runtime/NEON/functions/NESoftmaxLayer.h create mode 100644 arm_compute/runtime/NEON/functions/NETableLookup.h create mode 100644 arm_compute/runtime/NEON/functions/NEThreshold.h create mode 100644 arm_compute/runtime/NEON/functions/NETranspose.h create mode 100644 arm_compute/runtime/NEON/functions/NEWarpAffine.h create mode 100644 arm_compute/runtime/NEON/functions/NEWarpPerspective.h create mode 100644 arm_compute/runtime/OMP/OMPScheduler.h create mode 100644 arm_compute/runtime/Pyramid.h create mode 100644 arm_compute/runtime/Scheduler.h create mode 100644 arm_compute/runtime/SingleThreadScheduler.h create mode 100644 arm_compute/runtime/SubTensor.h create mode 100644 arm_compute/runtime/Tensor.h create mode 100644 arm_compute/runtime/TensorAllocator.h create mode 100644 arm_compute/runtime/Utils.h

diff --git a/arm_compute/core/AccessWindowAutoPadding.h b/arm_compute/core/AccessWindowAutoPadding.h
new file mode 100644
index 0000000000..0a3344b115
--- /dev/null
+++ b/arm_compute/core/AccessWindowAutoPadding.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ACCESS_WINDOW_AUTO_PADDING_H__
+#define __ARM_COMPUTE_ACCESS_WINDOW_AUTO_PADDING_H__
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class Window;
+class ITensorInfo;
+
+/** Dummy access window.
+ *
+ * This implementation always uses the auto padding of the tensor info and
+ * never updates the window. The valid region is always set to cover the entire
+ * tensor.
+ *
+ * @note This access window is only used during the migration to the new
+ *       padding system. It will be removed once all kernels have been ported.
+ */
+class AccessWindowAutoPadding : public IAccessWindow
+{
+public:
+    /** Default constructor.
+     *
+     * @param[in,out] info Tensor info of the accessed kernel.
+     */
+    AccessWindowAutoPadding(ITensorInfo *info);
+    AccessWindowAutoPadding(const AccessWindowAutoPadding &) = delete;
+    AccessWindowAutoPadding &operator=(const AccessWindowAutoPadding &) = delete;
+    AccessWindowAutoPadding(AccessWindowAutoPadding &&) = default;
+    AccessWindowAutoPadding &operator=(AccessWindowAutoPadding &&) = default;
+    ~AccessWindowAutoPadding() = default;
+
+    /** Set the valid region to match the entire tensor. */
+    void set_valid_region();
+
+    /** Return a valid region that spans across the entire tensor. */
+    ValidRegion compute_valid_region() const;
+
+    // Inherited methods overridden:
+    bool update_window_if_needed(Window &window) const override;
+    bool update_padding_if_needed(const Window &window) const override;
+    ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+
+private:
+    ITensorInfo *_info;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_ACCESS_WINDOW_AUTO_PADDING_H__*/
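For orientation, a window of this kind is normally created inside a kernel's configure() step. The sketch below is illustrative only: the configure_kernel() wrapper and tensor parameters are hypothetical, and only AccessWindowAutoPadding and its compute_valid_region() overload come from the header above.

```cpp
#include "arm_compute/core/AccessWindowAutoPadding.h"
#include "arm_compute/core/ITensorInfo.h"

using namespace arm_compute;

// Hypothetical configure() fragment: rely on the tensor's auto padding and
// mark the entire output tensor as valid.
void configure_kernel(ITensorInfo *output)
{
    AccessWindowAutoPadding output_access(output); // never changes the execution window

    // For an auto-padding window the valid region always spans the full tensor.
    output->set_valid_region(output_access.compute_valid_region());
}
```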
diff --git a/arm_compute/core/AccessWindowStatic.h b/arm_compute/core/AccessWindowStatic.h
new file mode 100644
index 0000000000..6dcba072c4
--- /dev/null
+++ b/arm_compute/core/AccessWindowStatic.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IACCESS_WINDOW_STATIC_H__
+#define __ARM_COMPUTE_IACCESS_WINDOW_STATIC_H__
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+
+#include <array>
+
+namespace arm_compute
+{
+class Window;
+class ITensorInfo;
+
+/** Implementation of a static rectangular access pattern.
+ *
+ * In this implementation the access offsets and sizes are not relative to the
+ * current element. Instead they are considered to be absolute coordinates
+ * within the accessed tensor's shape.
+ */
+class AccessWindowStatic : public IAccessWindow
+{
+public:
+    /** Constructor for a static access pattern.
+     *
+     * @param[in,out] info    Tensor info of the accessed kernel.
+     * @param[in]     start_x Start of the access in X direction.
+     * @param[in]     start_y Start of the access in Y direction.
+     * @param[in]     end_x   End of the access in X direction.
+     * @param[in]     end_y   End of the access in Y direction.
+     */
+    AccessWindowStatic(ITensorInfo *info, int start_x, int start_y, int end_x, int end_y);
+
+    AccessWindowStatic(const AccessWindowStatic &) = delete;
+    AccessWindowStatic &operator=(const AccessWindowStatic &) = delete;
+    AccessWindowStatic(AccessWindowStatic &&) = default;
+    AccessWindowStatic &operator=(AccessWindowStatic &&) = default;
+    ~AccessWindowStatic() = default;
+
+    /** Set the valid region based on the static access pattern and valid
+     * region of the inputs.
+     *
+     * @param[in] window             Execution window of the kernel.
+     * @param[in] input_valid_region Combined valid region of all inputs.
+     */
+    void set_valid_region(const Window &window, const ValidRegion &input_valid_region);
+
+    /** Compute the valid region based on the static access pattern and valid region of the inputs.
+     *
+     * @param[in] window             Execution window of the kernel.
+     * @param[in] input_valid_region Combined valid region of all inputs.
+     */
+    ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region) const;
+
+    // Inherited methods overridden:
+    bool update_window_if_needed(Window &window) const override;
+    bool update_padding_if_needed(const Window &window) const override;
+    ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+
+    ITensorInfo *_info;
+    int          _start_x;
+    int          _start_y;
+    int          _end_x;
+    int          _end_y;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_IACCESS_WINDOW_STATIC_H__*/
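As a usage illustration: unlike the relative access windows, the rectangle here is given in absolute tensor coordinates. A minimal sketch, assuming a hypothetical configure() fragment and a one-pixel border invented for the example; the constructor signature is the one documented above.

```cpp
#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/ITensorInfo.h"

using namespace arm_compute;

// Hypothetical configure() fragment: declare an absolute access rectangle
// covering the whole tensor plus a one-pixel border on every side.
void configure_kernel(ITensorInfo *input)
{
    const int width  = static_cast<int>(input->dimension(0));
    const int height = static_cast<int>(input->dimension(1));

    // Coordinates are absolute within the tensor shape, not relative to the
    // element being processed; the window would then typically be passed to
    // the padding/window update helpers.
    AccessWindowStatic input_access(input, -1, -1, width + 1, height + 1);
    (void)input_access;
}
```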
diff --git a/arm_compute/core/AccessWindowTranspose.h b/arm_compute/core/AccessWindowTranspose.h
new file mode 100644
index 0000000000..102860f9d8
--- /dev/null
+++ b/arm_compute/core/AccessWindowTranspose.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IACCESS_WINDOW_TRANSPOSE_H__
+#define __ARM_COMPUTE_IACCESS_WINDOW_TRANSPOSE_H__
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class Window;
+class ITensorInfo;
+
+/** Implementation of an XY-transpose access pattern. */
+class AccessWindowTranspose : public AccessWindowRectangle
+{
+public:
+    using AccessWindowRectangle::AccessWindowRectangle;
+    bool update_window_if_needed(Window &window) const override;
+    bool update_padding_if_needed(const Window &window) const override;
+    using AccessWindowRectangle::compute_valid_region;
+    ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_IACCESS_WINDOW_TRANSPOSE_H__*/
diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h
new file mode 100644
index 0000000000..26253e3f38
--- /dev/null
+++ b/arm_compute/core/CL/CLHelpers.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHELPERS_H__
+#define __ARM_COMPUTE_CLHELPERS_H__
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <string>
+
+namespace arm_compute
+{
+enum class DataType;
+enum class GPUTarget;
+
+/** Enable bitwise operations on GPUTarget enumerations */
+template <>
+struct enable_bitwise_ops<arm_compute::GPUTarget>
+{
+    static constexpr bool value = true;
+};
+
+/** Max vector width of an OpenCL vector */
+static constexpr const unsigned int max_cl_vector_width = 16;
+
+/** Translates a tensor data type to the appropriate OpenCL type.
+ *
+ * @param[in] dt @ref DataType to be translated to OpenCL type.
+ *
+ * @return The string specifying the OpenCL type to be used.
+ */
+std::string get_cl_type_from_data_type(const DataType &dt);
+
+/** Translates a given GPU device target to string.
+ *
+ * @param[in] target Given GPU target.
+ *
+ * @return The string describing the target.
+ */
+const std::string &string_from_target(GPUTarget target);
+
+/** Helper function to create and return a unique_ptr pointing to a CL kernel object.
+ *  It also calls the kernel's configuration.
+ *
+ * @param[in] args All the arguments that need to be passed to the kernel's configuration.
+ *
+ * @return A unique pointer pointing to the CL kernel object.
+ */
+template <typename Kernel, typename... T>
+std::unique_ptr<Kernel> create_configure_kernel(T &&... args)
+{
+    std::unique_ptr<Kernel> k = arm_compute::cpp14::make_unique<Kernel>();
+    k->configure(std::forward<T>(args)...);
+    return k;
+}
+
+/** Helper function to create and return a unique_ptr pointing to a CL kernel object.
+ *
+ * @return A unique pointer pointing to the CL kernel object.
+ */
+template <typename Kernel>
+std::unique_ptr<Kernel> create_kernel()
+{
+    std::unique_ptr<Kernel> k = arm_compute::cpp14::make_unique<Kernel>();
+    return k;
+}
+
+/** Helper function to get the GPU target from a CL device.
+ *
+ * @param[in] device A CL device.
+ *
+ * @return The GPU target.
+ */
+GPUTarget get_target_from_device(cl::Device &device);
+
+/** Helper function to get the GPU arch.
+ *
+ * @param[in] target GPU target.
+ *
+ * @return The GPU target representing the architecture.
+ */
+GPUTarget get_arch_from_target(GPUTarget target);
+}
+#endif
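A usage sketch for the two factory helpers above. The kernel type SomeCLKernel and its configure() arguments are hypothetical; only create_configure_kernel() and create_kernel() come from this header.

```cpp
// Hypothetical kernel type with a configure(const ICLTensor *, ICLTensor *) overload.
// One call allocates the kernel and forwards the arguments to configure():
auto k = create_configure_kernel<SomeCLKernel>(&src, &dst);

// Two-step variant when configuration has to happen later:
auto k2 = create_kernel<SomeCLKernel>();
k2->configure(&src, &dst);
```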
diff --git a/arm_compute/core/CL/CLKernelLibrary.h b/arm_compute/core/CL/CLKernelLibrary.h
new file mode 100644
index 0000000000..c29610c252
--- /dev/null
+++ b/arm_compute/core/CL/CLKernelLibrary.h
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLKERNELLIBRARY_H__
+#define __ARM_COMPUTE_CLKERNELLIBRARY_H__
+
+#include "arm_compute/core/CL/OpenCL.h"
+
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+
+namespace arm_compute
+{
+/** Program class */
+class Program
+{
+public:
+    /** Default constructor. */
+    Program();
+    /** Construct program from source file.
+     *
+     * @param[in] context CL context used to create the program.
+     * @param[in] name    Program name.
+     * @param[in] source  Program source.
+     */
+    Program(cl::Context context, std::string name, std::string source);
+    /** Construct program from binary file.
+     *
+     * @param[in] context CL context used to create the program.
+     * @param[in] device  CL device for which the programs are created.
+     * @param[in] name    Program name.
+     * @param[in] binary  Program binary.
+     */
+    Program(cl::Context context, cl::Device device, std::string name, std::vector<unsigned char> binary);
+    /** Default Copy Constructor. */
+    Program(const Program &) = default;
+    /** Default Move Constructor. */
+    Program(Program &&) = default;
+    /** Default copy assignment operator. */
+    Program &operator=(const Program &) = default;
+    /** Default move assignment operator. */
+    Program &operator=(Program &&) = default;
+    /** Returns program name.
+     *
+     * @return Program's name.
+     */
+    std::string name() const
+    {
+        return _name;
+    }
+    /** User-defined conversion to the underlying CL program.
+     *
+     * @return The CL program object.
+     */
+    explicit operator cl::Program() const;
+
+    /** Build the given CL program.
+     *
+     * @param[in] program       The CL program to build.
+     * @param[in] build_options (Optional) Options used to build the program.
+     *
+     * @return True if the build succeeded.
+     */
+    static bool build(const cl::Program &program, const std::string &build_options = "");
+    /** Build the underlying CL program.
+     *
+     * @param[in] build_options Options used to build the CL program.
+     *
+     * @return The built CL program.
+     */
+    cl::Program build(const std::string &build_options = "") const;
+
+private:
+    cl::Context                _context;   /**< Underlying CL context. */
+    cl::Device                 _device;    /**< CL device for which the programs are created. */
+    bool                       _is_binary; /**< Create program from binary? */
+    std::string                _name;      /**< Program name. */
+    std::string                _source;    /**< Source code for the program. */
+    std::vector<unsigned char> _binary;    /**< Binary from which to create the program. */
+};
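A short, hypothetical use of the Program wrapper. The program name, source string, and build option are invented for the example; the constructor and the two build() overloads are the ones declared above.

```cpp
// Wrap OpenCL C source in a Program, then compile it with extra options.
cl::Context context = cl::Context::getDefault();
Program     program(context, "example_program", source_str); // source_str holds OpenCL C code
cl::Program built = program.build("-DDATA_TYPE=float");      // returns the built cl::Program
```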
diff --git a/arm_compute/core/CL/CLKernelLibrary.h b/arm_compute/core/CL/CLKernelLibrary.h
new file mode 100644
index 0000000000..c29610c252
--- /dev/null
+++ b/arm_compute/core/CL/CLKernelLibrary.h
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLKERNELLIBRARY_H__
+#define __ARM_COMPUTE_CLKERNELLIBRARY_H__
+
+#include "arm_compute/core/CL/OpenCL.h"
+
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+
+namespace arm_compute
+{
+/** Program class */
+class Program
+{
+public:
+    /** Default constructor. */
+    Program();
+    /** Construct program from source file.
+     *
+     * @param[in] context CL context used to create the program.
+     * @param[in] name    Program name.
+     * @param[in] source  Program source.
+     */
+    Program(cl::Context context, std::string name, std::string source);
+    /** Construct program from binary file.
+     *
+     * @param[in] context CL context used to create the program.
+     * @param[in] device  CL device for which the programs are created.
+     * @param[in] name    Program name.
+     * @param[in] binary  Program binary.
+     */
+    Program(cl::Context context, cl::Device device, std::string name, std::vector<unsigned char> binary);
+    /** Default Copy Constructor. */
+    Program(const Program &) = default;
+    /** Default Move Constructor. */
+    Program(Program &&) = default;
+    /** Default copy assignment operator. */
+    Program &operator=(const Program &) = default;
+    /** Default move assignment operator. */
+    Program &operator=(Program &&) = default;
+    /** Returns program name.
+     *
+     * @return Program's name.
+     */
+    std::string name() const
+    {
+        return _name;
+    }
+    /** User-defined conversion to the underlying CL program.
+     *
+     * @return The CL program object.
+     */
+    explicit operator cl::Program() const;
+    /** Build the given CL program.
+     *
+     * @param[in] program       The CL program to build.
+     * @param[in] build_options Options used to build the program.
+     *
+     * @return True if the program was built successfully.
+     */
+    static bool build(const cl::Program &program, const std::string &build_options = "");
+    /** Build the underlying CL program.
+     *
+     * @param[in] build_options Options used to build the CL program.
+     *
+     * @return A reference to itself.
+     */
+    cl::Program build(const std::string &build_options = "") const;
+
+private:
+    cl::Context _context;               /**< Underlying CL context. */
+    cl::Device  _device;                /**< CL device for which the programs are created. */
+    bool        _is_binary;             /**< Create program from binary? */
+    std::string _name;                  /**< Program name. */
+    std::string _source;                /**< Source code for the program. */
+    std::vector<unsigned char> _binary; /**< Binary from which to create the program. */
+};
+
+/** Kernel class */
+class Kernel
+{
+public:
+    /** Default Constructor. */
+    Kernel();
+    /** Default Copy Constructor. */
+    Kernel(const Kernel &) = default;
+    /** Default Move Constructor. */
+    Kernel(Kernel &&) = default;
+    /** Default copy assignment operator. */
+    Kernel &operator=(const Kernel &) = default;
+    /** Default move assignment operator. */
+    Kernel &operator=(Kernel &&) = default;
+    /** Constructor.
+     *
+     * @param[in] name    Kernel name.
+     * @param[in] program Built program.
+     */
+    Kernel(std::string name, const cl::Program &program);
+    /** Returns kernel name.
+     *
+     * @return Kernel's name.
+     */
+    std::string name() const
+    {
+        return _name;
+    }
+    /** Returns OpenCL kernel.
+     *
+     * @return OpenCL Kernel.
+     */
+    explicit operator cl::Kernel() const
+    {
+        return _kernel;
+    }
+
+private:
+    std::string _name;   /**< Kernel name */
+    cl::Kernel  _kernel; /**< OpenCL Kernel */
+};
+
+/** CLKernelLibrary class */
+class CLKernelLibrary
+{
+    using StringSet = std::set<std::string>;
+
+private:
+    /** Default Constructor. */
+    CLKernelLibrary();
+
+public:
+    /** Prevent instances of this class from being copied. */
+    CLKernelLibrary(const CLKernelLibrary &) = delete;
+    /** Prevent instances of this class from being copied. */
+    const CLKernelLibrary &operator=(const CLKernelLibrary &) = delete;
+    /** Access the KernelLibrary singleton.
+     *
+     * @return The KernelLibrary instance.
+     */
+    static CLKernelLibrary &get();
+    /** Initialises the kernel library.
+     *
+     * @param[in] kernel_path (Optional) Path of the directory from which kernel sources are loaded.
+     * @param[in] context     (Optional) CL context used to create programs.
+     * @param[in] device      (Optional) CL device for which the programs are created.
+     */
+    void init(std::string kernel_path = ".", cl::Context context = cl::Context::getDefault(), cl::Device device = cl::Device::getDefault())
+    {
+        _kernel_path = std::move(kernel_path);
+        _context     = std::move(context);
+        _device      = std::move(device);
+    }
+    /** Sets the path that the kernels reside in.
+     *
+     * @param[in] kernel_path Path of the kernel.
+     */
+    void set_kernel_path(const std::string &kernel_path)
+    {
+        _kernel_path = kernel_path;
+    };
+    /** Sets the CL context used to create programs.
+     *
+     * @note Setting the context also resets the device to the
+     *       first one available in the new context.
+     *
+     * @param[in] context A CL context.
+     */
+    void set_context(cl::Context context)
+    {
+        _context = std::move(context);
+
+        const auto cl_devices = _context.getInfo<CL_CONTEXT_DEVICES>();
+
+        if(cl_devices.empty())
+        {
+            _device = cl::Device();
+        }
+        else
+        {
+            _device = cl_devices[0];
+        }
+    };
+    /** Sets the CL device for which the programs are created.
+     *
+     * @param[in] device A CL device.
+     */
+    void set_device(cl::Device device)
+    {
+        _device = std::move(device);
+    };
+    /** Creates a kernel from the kernel library.
+     *
+     * @param[in] kernel_name       Kernel name.
+     * @param[in] build_options_set Kernel build options as a set.
+     *
+     * @return The created kernel.
+     */
+    Kernel create_kernel(const std::string &kernel_name, const StringSet &build_options_set = {}) const;
+    /** Serializes and saves programs to a binary. */
+    void save_binary();
+    /** Loads a serialized binary with all the programs. */
+    void load_binary();
+
+private:
+    /** Load program and its dependencies.
+     *
+     * @param[in] program_name Name of the program to load.
+     */
+    const Program &load_program(const std::string &program_name) const;
+    /** Concatenates contents of a set into a single string.
+     *
+     * @param[in] s Input set to concatenate.
+     *
+     * @return Concatenated string.
+     */
+    std::string stringify_set(const StringSet &s) const;
+
+    cl::Context _context;     /**< Underlying CL context. */
+    cl::Device  _device;      /**< Underlying CL device. */
+    std::string _kernel_path; /**< Path to the kernels folder. */
+    mutable std::map<std::string, const Program> _programs_map;      /**< Map with all already loaded program data. */
+    mutable std::map<std::string, cl::Program> _built_programs_map;  /**< Map with all already built program data. */
+    static const std::map<std::string, std::string> _kernel_program_map; /**< Map that associates kernel names with programs. */
+    static const std::map<std::string, std::string> _program_source_map; /**< Contains sources for all programs.
+                                                                              Used for compile-time kernel inclusion. */
+};
+}
+#endif /* __ARM_COMPUTE_CLKERNELLIBRARY_H__ */
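A minimal usage sketch for the singleton above; the kernel path, kernel name and build option are illustrative, not taken from this patch:

    #include "arm_compute/core/CL/CLKernelLibrary.h"

    void setup_cl_kernels()
    {
        using arm_compute::CLKernelLibrary;
        // Context and device fall back to cl::Context::getDefault() / cl::Device::getDefault().
        CLKernelLibrary::get().init("./cl_kernels/");
        // Build options are passed as a set of strings:
        arm_compute::Kernel k   = CLKernelLibrary::get().create_kernel("example_kernel", { "-DDATA_TYPE=float" });
        cl::Kernel         cl_k = static_cast<cl::Kernel>(k); // via the explicit conversion operator
    }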
diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h
new file mode 100644
index 0000000000..0e9f356e52
--- /dev/null
+++ b/arm_compute/core/CL/CLKernels.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLKERNELS_H__
+#define __ARM_COMPUTE_CLKERNELS_H__
+
+/* Header regrouping all the CL kernels */
+#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
+#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h"
+#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
+#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
+#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h"
+#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h"
+#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h"
+#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h"
+#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h"
+#include "arm_compute/core/CL/kernels/CLCannyEdgeKernel.h"
+#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h"
+#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h"
+#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
+#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h"
+#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
+#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h"
+#include "arm_compute/core/CL/kernels/CLDepthConvertKernel.h"
+#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h"
+#include "arm_compute/core/CL/kernels/CLDilateKernel.h"
+#include "arm_compute/core/CL/kernels/CLErodeKernel.h"
+#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h"
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h"
+#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h"
+#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h"
+#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
+#include "arm_compute/core/CL/kernels/CLHistogramKernel.h"
+#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
+#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h"
+#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h"
+#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
+#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
+#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
+#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h"
+#include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h"
+#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h"
+#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
+#include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
+#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLRemapKernel.h"
+#include "arm_compute/core/CL/kernels/CLScaleKernel.h"
+#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h"
+#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h"
+#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h"
+#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h"
+#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h"
+#include "arm_compute/core/CL/kernels/CLThresholdKernel.h"
+#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
+#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h"
+#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h"
+#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
+
+#endif /* __ARM_COMPUTE_CLKERNELS_H__ */
diff --git a/arm_compute/core/CL/CLTypes.h b/arm_compute/core/CL/CLTypes.h
new file mode 100644
index 0000000000..c5643d8939
--- /dev/null
+++ b/arm_compute/core/CL/CLTypes.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CL_TYPES_H__
+#define __ARM_COMPUTE_CL_TYPES_H__
+
+namespace arm_compute
+{
+/** Available GPU Targets */
+enum class GPUTarget
+{
+    GPU_ARCH_MASK = 0xF00,
+    MIDGARD       = 0x100,
+    BIFROST       = 0x200,
+    T600          = 0x110,
+    T700          = 0x120,
+    T800          = 0x130,
+    G70           = 0x210
+};
+}
+#endif /* __ARM_COMPUTE_CL_TYPES_H__ */
diff --git a/arm_compute/core/CL/ICLArray.h b/arm_compute/core/CL/ICLArray.h
new file mode 100644
index 0000000000..1b676ed5a3
--- /dev/null
+++ b/arm_compute/core/CL/ICLArray.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ICLARRAY_H__
+#define __ARM_COMPUTE_ICLARRAY_H__
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/ITensor.h"
+
+namespace arm_compute
+{
+/** Interface for OpenCL Array */
+template <class T>
+class ICLArray : public IArray<T>
+{
+public:
+    /** Constructor */
+    explicit ICLArray(size_t max_num_values)
+        : IArray<T>(max_num_values), _mapping(nullptr)
+    {
+    }
+
+    ICLArray(const ICLArray &) = delete;
+    ICLArray &operator=(const ICLArray &) = delete;
+    virtual ~ICLArray() = default;
+    /** Interface to be implemented by the child class to return a reference to the OpenCL buffer containing the array's data.
+     *
+     * @return A reference to an OpenCL buffer containing the array's data.
+     */
+    virtual const cl::Buffer &cl_buffer() const = 0;
+    /** Enqueue a map operation of the allocated buffer on the given queue.
+     *
+     * @param[in,out] q        The CL command queue to use for the mapping operation.
+     * @param[in]     blocking If true, then the mapping will be ready to use by the time
+     *                         this method returns, else it is the caller's responsibility
+     *                         to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
+     */
+    void map(cl::CommandQueue &q, bool blocking = true)
+    {
+        _mapping = do_map(q, blocking);
+    }
+    /** Enqueue an unmap operation of the allocated and mapped buffer on the given queue.
+     *
+     * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+     *       the memory is accessed by the device.
+     *
+     * @param[in,out] q The CL command queue to use for the mapping operation.
+     */
+    void unmap(cl::CommandQueue &q)
+    {
+        do_unmap(q, _mapping);
+        _mapping = nullptr;
+    }
+
+    // Inherited methods overridden:
+    T *buffer() const override
+    {
+        return reinterpret_cast<T *>(_mapping);
+    }
+
+protected:
+    /** Method to be implemented by the child class to map the OpenCL buffer
+     *
+     * @param[in,out] q        The CL command queue to use for the mapping operation.
+     * @param[in]     blocking If true, then the mapping will be ready to use by the time
+     *                         this method returns, else it is the caller's responsibility
+     *                         to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
+     */
+    virtual uint8_t *do_map(cl::CommandQueue &q, bool blocking) = 0;
+    /** Method to be implemented by the child class to unmap the OpenCL buffer
+     *
+     * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+     *       the memory is accessed by the device.
+     *
+     * @param[in,out] q       The CL command queue to use for the mapping operation.
+     * @param[in]     mapping Pointer to the buffer to be unmapped.
+     */
+    virtual void do_unmap(cl::CommandQueue &q, uint8_t *mapping) = 0;
+
+private:
+    uint8_t *_mapping;
+};
+
+using ICLKeyPointArray        = ICLArray<KeyPoint>;
+using ICLCoordinates2DArray   = ICLArray<Coordinates2D>;
+using ICLDetectionWindowArray = ICLArray<DetectionWindow>;
+using ICLSize2DArray          = ICLArray<Size2D>;
+using ICLUInt8Array           = ICLArray<cl_uchar>;
+using ICLUInt16Array          = ICLArray<cl_ushort>;
+using ICLUInt32Array          = ICLArray<cl_uint>;
+using ICLInt16Array           = ICLArray<cl_short>;
+using ICLInt32Array           = ICLArray<cl_int>;
+using ICLFloatArray           = ICLArray<cl_float>;
+}
+#endif /*__ARM_COMPUTE_ICLARRAY_H__*/
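The map()/unmap() contract above is the usual host-access pattern; a sketch, assuming an allocated ICLKeyPointArray, an in-order command queue, and IArray's num_values() for the element count:

    void read_back(arm_compute::ICLKeyPointArray &array, cl::CommandQueue &queue)
    {
        array.map(queue, true); // blocking: pointer is valid on return
        arm_compute::KeyPoint *pts = array.buffer();
        // ... host-side work on pts[0 .. array.num_values() - 1] ...
        array.unmap(queue);     // only enqueued ...
        queue.finish();         // ... so flush/wait before the device touches the buffer again
    }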
diff --git a/arm_compute/core/CL/ICLDistribution1D.h b/arm_compute/core/CL/ICLDistribution1D.h
new file mode 100644
index 0000000000..8fbbbbf548
--- /dev/null
+++ b/arm_compute/core/CL/ICLDistribution1D.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ICLDISTRIBUTION1D_H__
+#define __ARM_COMPUTE_ICLDISTRIBUTION1D_H__
+
+#include "arm_compute/core/IDistribution1D.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace cl
+{
+class Buffer;
+class CommandQueue;
+}
+
+namespace arm_compute
+{
+/** ICLDistribution1D interface class */
+class ICLDistribution1D : public IDistribution1D
+{
+public:
+    /** Constructor: Creates a 1D CLDistribution of a consecutive interval [offset, offset + range - 1]
+     *  defined by a start offset and valid range, divided equally into num_bins parts.
+     *
+     * @param[in] num_bins The number of bins the distribution is divided in.
+     * @param[in] offset   The start of the values to use.
+     * @param[in] range    The total number of the consecutive values of the distribution interval.
+     */
+    ICLDistribution1D(size_t num_bins, int32_t offset, uint32_t range);
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    ICLDistribution1D(const ICLDistribution1D &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    const ICLDistribution1D &operator=(const ICLDistribution1D &) = delete;
+    /** Enqueue a map operation of the allocated buffer on the given queue.
+     *
+     * @param[in,out] q        The CL command queue to use for the mapping operation.
+     * @param[in]     blocking If true, then the mapping will be ready to use by the time
+     *                         this method returns, else it is the caller's responsibility
+     *                         to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
+ */ + void map(cl::CommandQueue &q, bool blocking = true); + /** Enqueue an unmap operation of the allocated and mapped buffer on the given queue. + * + * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before + * the memory is accessed by the device. + * + * @param[in,out] q The CL command queue to use for the mapping operation. + */ + void unmap(cl::CommandQueue &q); + /** Interface to be implemented by the child class to return a reference to the OpenCL buffer containing the distribution's data. + * + * @return A reference to an OpenCL buffer containing the distribution's data. + */ + virtual cl::Buffer &cl_buffer() = 0; + // Inherited methods overridden: + uint32_t *buffer() const override; + +protected: + /** Method to be implemented by the child class to map the OpenCL buffer + * + * @param[in,out] q The CL command queue to use for the mapping operation. + * @param[in] blocking If true, then the mapping will be ready to use by the time + * this method returns, else it is the caller's responsibility + * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer. + */ + virtual uint32_t *do_map(cl::CommandQueue &q, bool blocking) = 0; + /** Method to be implemented by the child class to unmap the OpenCL buffer + * + * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before + * the memory is accessed by the device. + * + * @param[in,out] q The CL command queue to use for the mapping operation. + */ + virtual void do_unmap(cl::CommandQueue &q) = 0; + +protected: + uint32_t *_mapping; /**< The distribution data. */ +}; +} +#endif /* __ARM_COMPUTE_ICLDISTRIBUTION1D_H__ */ diff --git a/arm_compute/core/CL/ICLHOG.h b/arm_compute/core/CL/ICLHOG.h new file mode 100644 index 0000000000..a3d2fb4a57 --- /dev/null +++ b/arm_compute/core/CL/ICLHOG.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef __ARM_COMPUTE_ICLHOG_H__
+#define __ARM_COMPUTE_ICLHOG_H__
+
+#include "arm_compute/core/IHOG.h"
+
+#include <cstdint>
+
+namespace cl
+{
+class Buffer;
+class CommandQueue;
+}
+
+namespace arm_compute
+{
+/** Interface for OpenCL HOG data-object */
+class ICLHOG : public IHOG
+{
+public:
+    /** Default constructor */
+    ICLHOG();
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    ICLHOG(const ICLHOG &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    ICLHOG &operator=(const ICLHOG &) = delete;
+    /** Allow instances of this class to be moved */
+    ICLHOG(ICLHOG &&) = default;
+    /** Allow instances of this class to be moved */
+    ICLHOG &operator=(ICLHOG &&) = default;
+    /** Default destructor */
+    virtual ~ICLHOG() = default;
+
+    /** Interface to be implemented by the child class to return a reference to the OpenCL buffer containing the hog's descriptor
+     *
+     * @return A reference to an OpenCL buffer containing the hog's descriptor
+     */
+    virtual const cl::Buffer &cl_buffer() const = 0;
+
+    /** Enqueue a map operation of the allocated buffer on the given queue.
+     *
+     * @param[in,out] q        The CL command queue to use for the mapping operation.
+     * @param[in]     blocking If true, then the mapping will be ready to use by the time
+     *                         this method returns, else it is the caller's responsibility
+     *                         to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
+     */
+    void map(cl::CommandQueue &q, bool blocking = true);
+
+    /** Enqueue an unmap operation of the allocated and mapped buffer on the given queue.
+     *
+     * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+     *       the memory is accessed by the device.
+     *
+     * @param[in,out] q The CL command queue to use for the mapping operation.
+     */
+    void unmap(cl::CommandQueue &q);
+
+    /** Interface to be implemented by the child class to free the allocated cl buffer.
+     *
+     * @warning The buffer must have been allocated previously. Otherwise calling the function will fail.
+     */
+    virtual void free() = 0;
+
+    // Inherited methods overridden:
+    float *descriptor() const override;
+
+protected:
+    /** Method to be implemented by the child class to map the OpenCL buffer
+     *
+     * @param[in,out] q        The CL command queue to use for the mapping operation.
+     * @param[in]     blocking If true, then the mapping will be ready to use by the time
+     *                         this method returns, else it is the caller's responsibility
+     *                         to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
+     */
+    virtual uint8_t *do_map(cl::CommandQueue &q, bool blocking) = 0;
+    /** Method to be implemented by the child class to unmap the OpenCL buffer
+     *
+     * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+     *       the memory is accessed by the device.
+     *
+     * @param[in,out] q The CL command queue to use for the mapping operation.
+ */ + virtual void do_unmap(cl::CommandQueue &q) = 0; + +private: + uint8_t *_mapping; +}; +} +#endif /*__ARM_COMPUTE_ICLHOG_H__ */ diff --git a/arm_compute/core/CL/ICLKernel.h b/arm_compute/core/CL/ICLKernel.h new file mode 100644 index 0000000000..72c963d11b --- /dev/null +++ b/arm_compute/core/CL/ICLKernel.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_ICLKERNEL_H__ +#define __ARM_COMPUTE_ICLKERNEL_H__ + +#include "arm_compute/core/CL/CLTypes.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/IKernel.h" + +namespace arm_compute +{ +class ICLTensor; +class Window; + +/** Common interface for all the OpenCL kernels */ +class ICLKernel : public IKernel +{ +public: + /** Constructor */ + ICLKernel(); + /** Returns a reference to the OpenCL kernel of this object. + * + * @return A reference to the OpenCL kernel of this object. + */ + cl::Kernel &kernel(); + /** Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx. + * + * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set. + * @param[in] tensor Tensor to set as an argument of the object's kernel. + * @param[in] window Window the kernel will be executed on. + */ + void add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window); + /** Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx. + * + * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set. + * @param[in] tensor Tensor to set as an argument of the object's kernel. + * @param[in] window Window the kernel will be executed on. + */ + void add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window); + /** Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx. + * + * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set. + * @param[in] tensor Tensor to set as an argument of the object's kernel. + * @param[in] window Window the kernel will be executed on. 
+     */
+    void add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window);
+    /** Returns the number of arguments enqueued per 1D tensor object.
+     *
+     * @return The number of arguments enqueued per 1D tensor object.
+     */
+    unsigned int num_arguments_per_1D_tensor() const;
+    /** Returns the number of arguments enqueued per 2D tensor object.
+     *
+     * @return The number of arguments enqueued per 2D tensor object.
+     */
+    unsigned int num_arguments_per_2D_tensor() const;
+    /** Returns the number of arguments enqueued per 3D tensor object.
+     *
+     * @return The number of arguments enqueued per 3D tensor object.
+     */
+    unsigned int num_arguments_per_3D_tensor() const;
+    /** Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue.
+     *
+     * @note The queue is *not* flushed by this method, and therefore the kernel will not have been executed by the time this method returns.
+     *
+     * @param[in]     window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     * @param[in,out] queue  Command queue on which to enqueue the kernel.
+     */
+    virtual void run(const Window &window, cl::CommandQueue &queue) = 0;
+    /** Add the passed parameters to the object's kernel's arguments starting from the index idx.
+     *
+     * @param[in,out] idx   Index at which to start adding the arguments. Will be incremented by the number of kernel arguments set.
+     * @param[in]     value Value to set as an argument of the object's kernel.
+     */
+    template <typename T>
+    void add_argument(unsigned int &idx, T value)
+    {
+        _kernel.setArg(idx++, value);
+    }
+
+    /** Set the targeted GPU architecture
+     *
+     * @param[in] target The targeted GPU architecture
+     */
+    void set_target(GPUTarget target);
+
+    /** Set the targeted GPU architecture according to the CL device
+     *
+     * @param[in] device A CL device
+     */
+    void set_target(cl::Device &device);
+
+    /** Get the targeted GPU architecture
+     *
+     * @return The targeted GPU architecture.
+     */
+    GPUTarget get_target() const;
+
+private:
+    /** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx.
+     *
+     * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+     * @param[in]     tensor Tensor to set as an argument of the object's kernel.
+     * @param[in]     window Window the kernel will be executed on.
+     */
+    template <unsigned int dimension_size>
+    void add_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window);
+    /** Returns the number of arguments enqueued per tensor object.
+     *
+     * @return The number of arguments enqueued per tensor object.
+     */
+    template <unsigned int dimension_size>
+    unsigned int num_arguments_per_tensor() const;
+
+protected:
+    cl::Kernel  _kernel;   /**< OpenCL kernel to run */
+    cl::NDRange _lws_hint; /**< Local workgroup size hint for the OpenCL kernel */
+    GPUTarget   _target;   /**< The targeted GPU */
+};
+
+/** Add the kernel to the command queue with the given window.
+ *
+ * @note Depending on the size of the window, this might translate into several jobs being enqueued.
+ *
+ * @note If kernel->kernel() is empty then the function will return without adding anything to the queue.
+ *
+ * @param[in,out] queue    OpenCL command queue.
+ * @param[in]     kernel   Kernel to enqueue
+ * @param[in]     window   Window the kernel has to process.
+ * @param[in]     lws_hint Local workgroup size requested, by default (128,1)
+ *
+ * @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed.
+ */
+void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint = cl::Range_128_1);
+}
+#endif /*__ARM_COMPUTE_ICLKERNEL_H__ */
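To illustrate how the argument helpers, _lws_hint and enqueue() combine, a hypothetical run() override for a one-input/one-output 2D kernel (CLExampleKernel and its _input/_output members are assumptions; the slicing helpers come from Window):

    void CLExampleKernel::run(const arm_compute::Window &window, cl::CommandQueue &queue)
    {
        arm_compute::Window slice = window.first_slice_window_2D();
        do
        {
            unsigned int idx = 0; // kernel arguments are packed starting at index 0
            add_2D_tensor_argument(idx, _input, slice);
            add_2D_tensor_argument(idx, _output, slice);
            arm_compute::enqueue(queue, *this, slice, _lws_hint);
        }
        while(window.slide_window_slice_2D(slice));
    }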
diff --git a/arm_compute/core/CL/ICLLut.h b/arm_compute/core/CL/ICLLut.h
new file mode 100644
index 0000000000..2016ebb5c3
--- /dev/null
+++ b/arm_compute/core/CL/ICLLut.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ICLLUT_H__
+#define __ARM_COMPUTE_ICLLUT_H__
+
+#include "arm_compute/core/ILut.h"
+
+#include <cstdint>
+
+namespace cl
+{
+class Buffer;
+class CommandQueue;
+}
+
+namespace arm_compute
+{
+/** Interface for OpenCL LUT */
+class ICLLut : public ILut
+{
+public:
+    ICLLut();
+    ICLLut(const ICLLut &) = delete;
+    ICLLut &operator=(const ICLLut &) = delete;
+
+    /** Interface to be implemented by the child class to return a reference to the OpenCL buffer containing the lut's data.
+     *
+     * @return A reference to an OpenCL buffer containing the lut's data.
+     */
+    virtual const cl::Buffer &cl_buffer() const = 0;
+    /** Enqueue a map operation of the allocated buffer on the given queue.
+     *
+     * @param[in,out] q        The CL command queue to use for the mapping operation.
+     * @param[in]     blocking If true, then the mapping will be ready to use by the time
+     *                         this method returns, else it is the caller's responsibility
+     *                         to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
+     */
+    void map(cl::CommandQueue &q, bool blocking = true);
+    /** Enqueue an unmap operation of the allocated and mapped buffer on the given queue.
+     *
+     * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+     *       the memory is accessed by the device.
+     *
+     * @param[in,out] q The CL command queue to use for the mapping operation.
+     */
+    void unmap(cl::CommandQueue &q);
+
+    // Inherited methods overridden:
+    uint8_t *buffer() const override;
+
+protected:
+    /** Method to be implemented by the child class to map the OpenCL buffer
+     *
+     * @param[in,out] q        The CL command queue to use for the mapping operation.
+     * @param[in]     blocking If true, then the mapping will be ready to use by the time
+     *                         this method returns, else it is the caller's responsibility
+     *                         to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
+     */
+    virtual uint8_t *do_map(cl::CommandQueue &q, bool blocking) = 0;
+    /** Method to be implemented by the child class to unmap the OpenCL buffer
+     *
+     * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+     *       the memory is accessed by the device.
+     *
+     * @param[in,out] q The CL command queue to use for the mapping operation.
+     */
+    virtual void do_unmap(cl::CommandQueue &q) = 0;
+
+private:
+    uint8_t *_mapping;
+};
+}
+#endif /*__ARM_COMPUTE_ICLLUT_H__ */
diff --git a/arm_compute/core/CL/ICLMultiHOG.h b/arm_compute/core/CL/ICLMultiHOG.h
new file mode 100644
index 0000000000..9f3c775230
--- /dev/null
+++ b/arm_compute/core/CL/ICLMultiHOG.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ICLMULTIHOG_H__
+#define __ARM_COMPUTE_ICLMULTIHOG_H__
+
+#include "arm_compute/core/CL/ICLHOG.h"
+#include "arm_compute/core/IMultiHOG.h"
+
+namespace arm_compute
+{
+/** Interface for storing multiple HOG data-objects */
+class ICLMultiHOG : public IMultiHOG
+{
+public:
+    /** Return a pointer to the requested OpenCL HOG model
+     *
+     * @param[in] index The index of the wanted OpenCL HOG model.
+     *
+     * @return A pointer to the requested HOG model
+     */
+    virtual ICLHOG *cl_model(size_t index) = 0;
+    /** Return a constant pointer to the requested OpenCL HOG model
+     *
+     * @param[in] index The index of the wanted OpenCL HOG model.
+     *
+     * @return A constant pointer to the requested OpenCL HOG model
+     */
+    virtual const ICLHOG *cl_model(size_t index) const = 0;
+
+    // Inherited methods overridden:
+    IHOG *model(size_t index) override;
+    const IHOG *model(size_t index) const override;
+};
+}
+#endif /*__ARM_COMPUTE_ICLMULTIHOG_H__ */
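A sketch of iterating the typed accessors above, assuming IMultiHOG exposes num_models() and that the models are allocated:

    void map_all_models(arm_compute::ICLMultiHOG &multi_hog, cl::CommandQueue &queue)
    {
        for(size_t i = 0; i < multi_hog.num_models(); ++i)
        {
            arm_compute::ICLHOG *hog = multi_hog.cl_model(i); // OpenCL view of model i
            hog->map(queue, true);
            float *descriptor = hog->descriptor(); // host pointer, valid while mapped
            // ... host-side work on the descriptor ...
            hog->unmap(queue);
        }
    }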
diff --git a/arm_compute/core/CL/ICLMultiImage.h b/arm_compute/core/CL/ICLMultiImage.h
new file mode 100644
index 0000000000..e8705b1824
--- /dev/null
+++ b/arm_compute/core/CL/ICLMultiImage.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ICLMULTIIMAGE_H__
+#define __ARM_COMPUTE_ICLMULTIIMAGE_H__
+
+#include "arm_compute/core/IMultiImage.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Interface for OpenCL multi-planar images */
+class ICLMultiImage : public IMultiImage
+{
+public:
+    /** Return a pointer to the requested OpenCL plane of the image.
+     *
+     * @param[in] index The index of the wanted plane.
+     *
+     * @return A pointer to the requested OpenCL plane
+     */
+    virtual ICLImage *cl_plane(unsigned int index) = 0;
+    /** Return a constant pointer to the requested OpenCL plane of the image.
+     *
+     * @param[in] index The index of the wanted plane.
+     *
+     * @return A constant pointer to the requested OpenCL plane
+     */
+    virtual const ICLImage *cl_plane(unsigned int index) const = 0;
+
+    // Inherited methods overridden:
+    IImage *plane(unsigned int index) override;
+    const IImage *plane(unsigned int index) const override;
+};
+}
+#endif /*__ARM_COMPUTE_ICLMULTIIMAGE_H__ */
diff --git a/arm_compute/core/CL/ICLSimple2DKernel.h b/arm_compute/core/CL/ICLSimple2DKernel.h
new file mode 100644
index 0000000000..a1366fb211
--- /dev/null
+++ b/arm_compute/core/CL/ICLSimple2DKernel.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#ifndef __ARM_COMPUTE_ICLSIMPLE2DKERNEL_H__ +#define __ARM_COMPUTE_ICLSIMPLE2DKERNEL_H__ + +#include "arm_compute/core/CL/ICLSimpleKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for simple OpenCL kernels having 1 tensor input and 1 tensor output. This interface can be used when the work-item processes a 2D tile */ +class ICLSimple2DKernel : public ICLSimpleKernel +{ +public: + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; +}; +} +#endif /*__ARM_COMPUTE_ICLSIMPLE2DKERNEL_H__ */ diff --git a/arm_compute/core/CL/ICLSimple3DKernel.h b/arm_compute/core/CL/ICLSimple3DKernel.h new file mode 100644 index 0000000000..5e981027de --- /dev/null +++ b/arm_compute/core/CL/ICLSimple3DKernel.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_ICLSIMPLE3DKERNEL_H__ +#define __ARM_COMPUTE_ICLSIMPLE3DKERNEL_H__ + +#include "arm_compute/core/CL/ICLSimple2DKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for simple OpenCL kernels having 1 tensor input and 1 tensor output. + * Both input tensor and output tensor must have at least 3 dimensions. + */ +class ICLSimple3DKernel : public ICLSimple2DKernel +{ +public: + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; +}; +} +#endif /*__ARM_COMPUTE_ICLSIMPLE3DKERNEL_H__ */ diff --git a/arm_compute/core/CL/ICLSimpleKernel.h b/arm_compute/core/CL/ICLSimpleKernel.h new file mode 100644 index 0000000000..e9fdb7fb8b --- /dev/null +++ b/arm_compute/core/CL/ICLSimpleKernel.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_ICLSIMPLEKERNEL_H__ +#define __ARM_COMPUTE_ICLSIMPLEKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" + +namespace arm_compute +{ +/** Interface for simple OpenCL kernels having 1 tensor input and 1 tensor output */ +class ICLSimpleKernel : public ICLKernel +{ +public: + /** Constructor. */ + ICLSimpleKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + ICLSimpleKernel(const ICLSimpleKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + ICLSimpleKernel &operator=(const ICLSimpleKernel &) = delete; + /** Allow instances of this class to be moved. */ + ICLSimpleKernel(ICLSimpleKernel &&) = default; + /** Allow instances of this class to be moved. */ + ICLSimpleKernel &operator=(ICLSimpleKernel &&) = default; + /** Default destructor */ + ~ICLSimpleKernel() = default; + + /** Configure the kernel + * + * @param[in] input Source tensor. + * @param[out] output Destination tensor. + * @param[in] num_elems_processed_per_iteration Number of processed elements per iteration. + * @param[in] border_undefined (Optional) True if the border mode is undefined. False if it's replicate or constant. + * @param[in] border_size (Optional) Size of the border. + */ + void configure(const ICLTensor *input, ICLTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined = false, const BorderSize &border_size = BorderSize()); + +protected: + const ICLTensor *_input; + ICLTensor *_output; +}; +} + +#endif /*__ARM_COMPUTE_ICLSIMPLEKERNEL_H__ */ diff --git a/arm_compute/core/CL/ICLTensor.h b/arm_compute/core/CL/ICLTensor.h new file mode 100644 index 0000000000..abc0131379 --- /dev/null +++ b/arm_compute/core/CL/ICLTensor.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ICLTENSOR_H__
+#define __ARM_COMPUTE_ICLTENSOR_H__
+
+#include "arm_compute/core/ITensor.h"
+
+#include <cstdint>
+
+namespace cl
+{
+class Buffer;
+class CommandQueue;
+}
+
+namespace arm_compute
+{
+/** Interface for OpenCL tensor */
+class ICLTensor : public ITensor
+{
+public:
+    ICLTensor();
+    ICLTensor(const ICLTensor &) = delete;
+    ICLTensor &operator=(const ICLTensor &) = delete;
+    ICLTensor(ICLTensor &&) = default;
+    ICLTensor &operator=(ICLTensor &&) = default;
+    virtual ~ICLTensor() = default;
+
+    /** Interface to be implemented by the child class to return a reference to the OpenCL buffer containing the image's data.
+     *
+     * @return A reference to an OpenCL buffer containing the image's data.
+     */
+    virtual const cl::Buffer &cl_buffer() const = 0;
+    /** Enqueue a map operation of the allocated buffer on the given queue.
+     *
+     * @param[in,out] q        The CL command queue to use for the mapping operation.
+     * @param[in]     blocking If true, then the mapping will be ready to use by the time
+     *                         this method returns, else it is the caller's responsibility
+     *                         to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
+     */
+    void map(cl::CommandQueue &q, bool blocking = true);
+    /** Enqueue an unmap operation of the allocated and mapped buffer on the given queue.
+     *
+     * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+     *       the memory is accessed by the device.
+     *
+     * @param[in,out] q The CL command queue to use for the mapping operation.
+     */
+    void unmap(cl::CommandQueue &q);
+    /** Clear the contents of the tensor synchronously.
+     *
+     * @param[in,out] q The CL command queue to use for the clear operation.
+     */
+    void clear(cl::CommandQueue &q);
+
+    // Inherited methods overridden:
+    uint8_t *buffer() const override;
+
+protected:
+    /** Method to be implemented by the child class to map the OpenCL buffer
+     *
+     * @param[in,out] q        The CL command queue to use for the mapping operation.
+     * @param[in]     blocking If true, then the mapping will be ready to use by the time
+     *                         this method returns, else it is the caller's responsibility
+     *                         to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
+     */
+    virtual uint8_t *do_map(cl::CommandQueue &q, bool blocking) = 0;
+    /** Method to be implemented by the child class to unmap the OpenCL buffer
+     *
+     * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+     *       the memory is accessed by the device.
+     *
+     * @param[in,out] q The CL command queue to use for the mapping operation.
+     */
+    virtual void do_unmap(cl::CommandQueue &q) = 0;
+
+private:
+    uint8_t *_mapping;
+};
+
+using ICLImage = ICLTensor;
+}
+#endif /*__ARM_COMPUTE_ICLTENSOR_H__ */
diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h
new file mode 100644
index 0000000000..2fae35c974
--- /dev/null
+++ b/arm_compute/core/CL/OpenCL.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_OPENCL_H__
+#define __ARM_COMPUTE_OPENCL_H__
+
+/* Configure the Khronos C++ wrapper to target OpenCL 1.1: */
+#define CL_HPP_ENABLE_EXCEPTIONS
+#define CL_HPP_CL_1_2_DEFAULT_BUILD
+#define CL_HPP_TARGET_OPENCL_VERSION 110
+#define CL_HPP_MINIMUM_OPENCL_VERSION 110
+#include <CL/cl2.hpp>
+
+namespace cl
+{
+static const NDRange Range_128_1 = NDRange(128, 1);
+}
+
+namespace arm_compute
+{
+bool opencl_is_available();
+}
+#endif /* __ARM_COMPUTE_OPENCL_H__ */
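Since the runtime may be built against a dynamically loaded OpenCL library, opencl_is_available() is the natural guard before any CL setup; a sketch combining it with CLKernelLibrary:

    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/core/CL/OpenCL.h"

    bool try_init_cl()
    {
        if(!arm_compute::opencl_is_available())
        {
            return false; // no usable OpenCL runtime on this machine
        }
        arm_compute::CLKernelLibrary::get().init(); // defaults: ".", default context/device
        return true;
    }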
*/ + CLAbsoluteDifferenceKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLAbsoluteDifferenceKernel(const CLAbsoluteDifferenceKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLAbsoluteDifferenceKernel &operator=(const CLAbsoluteDifferenceKernel &) = delete; + /** Allow instances of this class to be moved. */ + CLAbsoluteDifferenceKernel(CLAbsoluteDifferenceKernel &&) = default; + /** Allow instances of this class to be moved. */ + CLAbsoluteDifferenceKernel &operator=(CLAbsoluteDifferenceKernel &&) = default; + /** Default destructor */ + ~CLAbsoluteDifferenceKernel() = default; + + /** Set the inputs and output images. + * + * @param[in] input1 Source tensor. Data types supported: U8/S16. + * @param[in] input2 Source tensor. Data types supported: U8/S16. + * @param[out] output Destination tensor. Data types supported: U8/S16. + */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input1; /**< Source tensor 1. */ + const ICLTensor *_input2; /**< Source tensor 2. */ + ICLTensor *_output; /**< Destination tensor. */ +}; +} +#endif /* __ARM_COMPUTE_CLABSOLUTEDIFFERENCEKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLAccumulateKernel.h b/arm_compute/core/CL/kernels/CLAccumulateKernel.h new file mode 100644 index 0000000000..5c8ffdb404 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLAccumulateKernel.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLACCUMULATEKERNEL_H__ +#define __ARM_COMPUTE_CLACCUMULATEKERNEL_H__ + +#include "arm_compute/core/CL/ICLSimple2DKernel.h" + +#include <cstdint> + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the accumulate kernel. + * + * Accumulation is computed by: + * @f[ accum(x,y) = accum(x,y) + input(x,y) @f] + */ +class CLAccumulateKernel : public ICLSimple2DKernel +{ +public: + /** Set the input and accumulation tensors. + * + * @param[in] input Source tensor. Data types supported: U8. + * @param[out] accum Destination tensor. Data types supported: S16. + */ + void configure(const ICLTensor *input, ICLTensor *accum); +}; + +/** Interface for the accumulate weighted kernel.
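For illustration, configuring and launching the absolute-difference kernel could look as follows. This is a sketch: in1, in2 and out are assumed to be already-allocated U8 ICLTensor objects, win a valid execution window for the kernel, and queue a CL command queue (all hypothetical names):

    CLAbsoluteDifferenceKernel absdiff;
    absdiff.configure(&in1, &in2, &out); // out(x,y) = |in1(x,y) - in2(x,y)|
    absdiff.run(win, queue);             // enqueue the kernel on the given queue
    queue.finish();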
+ * + * Weighted accumulation is computed: + * @f[ accum(x,y) = (1 - \alpha)*accum(x,y) + \alpha*input(x,y) @f] + * + * Where @f$ 0 \le \alpha \le 1 @f$ + * Conceptually, the rounding for this is defined as: + * @f[ output(x,y)= uint8( (1 - \alpha) * float32( int32( output(x,y) ) ) + \alpha * float32( int32( input(x,y) ) ) ) @f] +*/ +class CLAccumulateWeightedKernel : public ICLSimple2DKernel +{ +public: + /** Set the input and accumulation images, and the scale value. + * + * @param[in] input Source tensor. Data types supported: U8. + * @param[in] alpha Scalar value in the range [0, 1.0]. Data types supported: F32. + * @param[in,out] accum Accumulated tensor. Data types supported: U8. + */ + void configure(const ICLTensor *input, float alpha, ICLTensor *accum); +}; + +/** Interface for the accumulate squared kernel. + * + * The accumulation of squares is computed: + * @f[ accum(x,y) = saturate_{int16} ( (uint16) accum(x,y) + (((uint16)(input(x,y)^2)) >> (shift)) ) @f] + * + * Where @f$ 0 \le shift \le 15 @f$ +*/ +class CLAccumulateSquaredKernel : public ICLSimple2DKernel +{ +public: + /** Set the input and accumulation tensors and the shift value. + * + * @param[in] input Source tensor. Data types supported: U8. + * @param[in] shift Shift value in the range of [0, 15]. Data types supported: U32. + * @param[in,out] accum Accumulated tensor. Data types supported: S16. + */ + void configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum); +}; +} +#endif /*__ARM_COMPUTE_CLACCUMULATEKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h b/arm_compute/core/CL/kernels/CLActivationLayerKernel.h new file mode 100644 index 0000000000..490e70544b --- /dev/null +++ b/arm_compute/core/CL/kernels/CLActivationLayerKernel.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLACTIVATIONLAYERKERNEL_H__ +#define __ARM_COMPUTE_CLACTIVATIONLAYERKERNEL_H__ + +#include "arm_compute/core/CL/ICLSimple3DKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the activation layer kernel. */ +class CLActivationLayerKernel : public ICLSimple3DKernel +{ +public: + /** Set the input and output tensor. + * + * @param[in] input Source tensor. Data types supported: F16, F32, U16, S16. + * @param[out] output Destination tensor. Data type should match the input data type. 
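A quick numeric check of the weighted-accumulation formula above: with alpha = 0.25, accum(x,y) = 200 and input(x,y) = 40, the new value is 0.75 * 200 + 0.25 * 40 = 160. In code (a sketch; tensors assumed allocated and the window/queue obtained as in the earlier examples):

    CLAccumulateWeightedKernel acc_w;
    acc_w.configure(&input, 0.25f, &accum); // accum = 0.75 * accum + 0.25 * input
    acc_w.run(win, queue);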
+ * @param[in] act_info Activation layer information. + */ + void configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info); +}; +} +#endif /*__ARM_COMPUTE_CLACTIVATIONLAYERKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h b/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h new file mode 100644 index 0000000000..7d736cdf44 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLARITHMETICADDITIONKERNEL_H__ +#define __ARM_COMPUTE_CLARITHMETICADDITIONKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the arithmetic addition kernel + * + * Arithmetic addition is computed by: + * @f[ output(x,y) = input1(x,y) + input2(x,y) @f] + */ +class CLArithmeticAdditionKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLArithmeticAdditionKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArithmeticAdditionKernel(const CLArithmeticAdditionKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArithmeticAdditionKernel &operator=(const CLArithmeticAdditionKernel &) = delete; + /** Allow instances of this class to be moved */ + CLArithmeticAdditionKernel(CLArithmeticAdditionKernel &&) = default; + /** Allow instances of this class to be moved */ + CLArithmeticAdditionKernel &operator=(CLArithmeticAdditionKernel &&) = default; + /** Default destructor */ + ~CLArithmeticAdditionKernel() = default; + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in] input1 First tensor input. Data types supported: U8, S16, F16, F32. + * @param[in] input2 Second tensor input. Data types supported: U8, S16, F16, F32. + * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), S16, F16, F32. + * @param[in] policy Policy to use to handle overflow.
+ */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input1; /**< Source tensor 1 */ + const ICLTensor *_input2; /**< Source tensor 2 */ + ICLTensor *_output; /**< Destination tensor */ +}; +} +#endif /* __ARM_COMPUTE_CLARITHMETICADDITIONKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h b/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h new file mode 100644 index 0000000000..afecf6ed7d --- /dev/null +++ b/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLARITHMETICSUBTRACTIONKERNEL_H__ +#define __ARM_COMPUTE_CLARITHMETICSUBTRACTIONKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the arithmetic subtraction kernel + * + * Arithmetic subtraction is computed by: + * @f[ output(x,y) = input1(x,y) - input2(x,y) @f] + */ +class CLArithmeticSubtractionKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLArithmeticSubtractionKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArithmeticSubtractionKernel(const CLArithmeticSubtractionKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArithmeticSubtractionKernel &operator=(const CLArithmeticSubtractionKernel &) = delete; + /** Allow instances of this class to be moved */ + CLArithmeticSubtractionKernel(CLArithmeticSubtractionKernel &&) = default; + /** Allow instances of this class to be moved */ + CLArithmeticSubtractionKernel &operator=(CLArithmeticSubtractionKernel &&) = default; + /** Default destructor */ + ~CLArithmeticSubtractionKernel() = default; + + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in] input1 First tensor input. Data types supported: U8, S16, F16, F32. + * @param[in] input2 Second tensor input. Data types supported: U8, S16, F16, F32. + * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), S16, F16, F32. + * @param[in] policy Policy to use to handle overflow.
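The ConvertPolicy parameter in these arithmetic kernels decides what happens when a result does not fit the output type: WRAP keeps the low-order bits, SATURATE clamps to the representable range. For instance, 200 + 100 stored into a U8 output gives 44 under WRAP (300 modulo 256) but 255 under SATURATE. A sketch, with a, b and sum assumed to be allocated U8 tensors:

    CLArithmeticAdditionKernel add;
    add.configure(&a, &b, &sum, ConvertPolicy::SATURATE); // clamp on overflow instead of wrapping
    add.run(win, queue);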
+ */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input1; /**< Source tensor 1 */ + const ICLTensor *_input2; /**< Source tensor 2 */ + ICLTensor *_output; /**< Destination tensor */ +}; +} +#endif /* __ARM_COMPUTE_CLARITHMETICSUBTRACTIONKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h b/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h new file mode 100644 index 0000000000..088853841b --- /dev/null +++ b/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H__ +#define __ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the BatchNormalization layer kernel. + */ +class CLBatchNormalizationLayerKernel : public ICLKernel +{ +public: + /** Constructor */ + CLBatchNormalizationLayerKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLBatchNormalizationLayerKernel(const CLBatchNormalizationLayerKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLBatchNormalizationLayerKernel &operator=(const CLBatchNormalizationLayerKernel &) = delete; + /** Default Move Constructor. */ + CLBatchNormalizationLayerKernel(CLBatchNormalizationLayerKernel &&) = default; + /** Default move assignment operator. */ + CLBatchNormalizationLayerKernel &operator=(CLBatchNormalizationLayerKernel &&) = default; + /** Default destructor */ + ~CLBatchNormalizationLayerKernel() = default; + + /** Set the input and output tensors. + * + * @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, FM]. + * The rest are optional and used for representing batches. Data types supported: F32. + * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input + * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM].
Data types supported: Same as @p input + * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] beta Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] epsilon Small value to avoid division by zero. + */ + void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; + const ICLTensor *_mean; + const ICLTensor *_var; + const ICLTensor *_beta; + const ICLTensor *_gamma; + float _epsilon; +}; +} +#endif /*__ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLBitwiseAndKernel.h b/arm_compute/core/CL/kernels/CLBitwiseAndKernel.h new file mode 100644 index 0000000000..624c422abc --- /dev/null +++ b/arm_compute/core/CL/kernels/CLBitwiseAndKernel.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLBITWISEANDKERNEL_H__ +#define __ARM_COMPUTE_CLBITWISEANDKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the bitwise AND operation kernel. + * + * Result is computed by: + * @f[ output(x,y) = input1(x,y) \land input2(x,y) @f] + */ +class CLBitwiseAndKernel : public ICLKernel +{ +public: + /** Default constructor. */ + CLBitwiseAndKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLBitwiseAndKernel(const CLBitwiseAndKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLBitwiseAndKernel &operator=(const CLBitwiseAndKernel &) = delete; + /** Allow instances of this class to be moved */ + CLBitwiseAndKernel(CLBitwiseAndKernel &&) = default; + /** Allow instances of this class to be moved */ + CLBitwiseAndKernel &operator=(CLBitwiseAndKernel &&) = default; + /** Set the inputs and output images + * + * @param[in] input1 Source tensor.
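The header does not spell out the transformation itself, but batch normalization is conventionally defined per feature map as out = gamma * (in - mean) / sqrt(var + epsilon) + beta. Assuming this kernel follows the standard definition, the reference arithmetic for a single element would be:

    #include <cmath>

    // Standard batch normalization of one value, using per-feature-map statistics.
    float batch_normalize(float in, float mean, float var, float beta, float gamma, float epsilon)
    {
        return gamma * (in - mean) / std::sqrt(var + epsilon) + beta;
    }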
Data types supported: U8. + * @param[in] input2 Source tensor. Data types supported: U8. + * @param[out] output Destination tensor. Data types supported: U8. + */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input1; /**< Source tensor 1 */ + const ICLTensor *_input2; /**< Source tensor 2 */ + ICLTensor *_output; /**< Destination tensor */ +}; +} +#endif /* __ARM_COMPUTE_CLBITWISEANDKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLBitwiseNotKernel.h b/arm_compute/core/CL/kernels/CLBitwiseNotKernel.h new file mode 100644 index 0000000000..c9026022e1 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLBitwiseNotKernel.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLBITWISENOTKERNEL_H__ +#define __ARM_COMPUTE_CLBITWISENOTKERNEL_H__ + +#include "arm_compute/core/CL/ICLSimple2DKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the bitwise NOT operation kernel. + * + * Result is computed by: + * @f[ output(x,y) = \lnot input(x,y) @f] + */ +class CLBitwiseNotKernel : public ICLSimple2DKernel +{ +public: + /** Set the input and output images. + * + * @param[in] input Source tensor. Data types supported: U8. + * @param[out] output Destination tensor. Data types supported: U8. + */ + void configure(const ICLTensor *input, ICLTensor *output); +}; +} +#endif /* __ARM_COMPUTE_CLBITWISENOTKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLBitwiseOrKernel.h b/arm_compute/core/CL/kernels/CLBitwiseOrKernel.h new file mode 100644 index 0000000000..fe8710fbc1 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLBitwiseOrKernel.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited.
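All four bitwise kernels share the same configure/run shape; e.g. for the AND kernel (a sketch, with U8 tensors assumed allocated as in the earlier examples):

    CLBitwiseAndKernel bitwise_and;
    bitwise_and.configure(&in1, &in2, &out); // out(x,y) = in1(x,y) & in2(x,y)
    bitwise_and.run(win, queue);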
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLBITWISEORKERNEL_H__ +#define __ARM_COMPUTE_CLBITWISEORKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the bitwise OR operation kernel. + * + * Result is computed by: + * @f[ output(x,y) = input1(x,y) \lor input2(x,y) @f] + */ +class CLBitwiseOrKernel : public ICLKernel +{ +public: + /** Default constructor. */ + CLBitwiseOrKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLBitwiseOrKernel(const CLBitwiseOrKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLBitwiseOrKernel &operator=(const CLBitwiseOrKernel &) = delete; + /** Allow instances of this class to be moved */ + CLBitwiseOrKernel(CLBitwiseOrKernel &&) = default; + /** Allow instances of this class to be moved */ + CLBitwiseOrKernel &operator=(CLBitwiseOrKernel &&) = default; + /** Set the inputs and output images + * + * @param[in] input1 Source tensor. Data types supported: U8. + * @param[in] input2 Source tensor. Data types supported: U8. + * @param[out] output Destination tensor. Data types supported: U8. + */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input1; /**< Source tensor 1 */ + const ICLTensor *_input2; /**< Source tensor 2 */ + ICLTensor *_output; /**< Destination tensor */ +}; +} +#endif /* __ARM_COMPUTE_CLBITWISEORKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLBitwiseXorKernel.h b/arm_compute/core/CL/kernels/CLBitwiseXorKernel.h new file mode 100644 index 0000000000..f4e0b4df60 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLBitwiseXorKernel.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLBITWISEXORKERNEL_H__ +#define __ARM_COMPUTE_CLBITWISEXORKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the bitwise XOR operation kernel. + * + * Result is computed by: + * @f[ output(x,y) = input1(x,y) \oplus input2(x,y) @f] + */ +class CLBitwiseXorKernel : public ICLKernel +{ +public: + /** Default constructor. */ + CLBitwiseXorKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLBitwiseXorKernel(const CLBitwiseXorKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLBitwiseXorKernel &operator=(const CLBitwiseXorKernel &) = delete; + /** Allow instances of this class to be moved */ + CLBitwiseXorKernel(CLBitwiseXorKernel &&) = default; + /** Allow instances of this class to be moved */ + CLBitwiseXorKernel &operator=(CLBitwiseXorKernel &&) = default; + /** Set the inputs and output images + * + * @param[in] input1 Source tensor. Data types supported: U8. + * @param[in] input2 Source tensor. Data types supported: U8. + * @param[out] output Destination tensor. Data types supported: U8. + */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input1; /**< Source tensor 1 */ + const ICLTensor *_input2; /**< Source tensor 2 */ + ICLTensor *_output; /**< Destination tensor */ +}; +} +#endif /* __ARM_COMPUTE_CLBITWISEXORKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLBox3x3Kernel.h b/arm_compute/core/CL/kernels/CLBox3x3Kernel.h new file mode 100644 index 0000000000..0960f7487a --- /dev/null +++ b/arm_compute/core/CL/kernels/CLBox3x3Kernel.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLBOX3X3KERNEL_H__ +#define __ARM_COMPUTE_CLBOX3X3KERNEL_H__ + +#include "arm_compute/core/CL/ICLSimple2DKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the box 3x3 filter kernel. + * + */ +class CLBox3x3Kernel : public ICLSimple2DKernel +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] input An input tensor. Data types supported: U8. + * @param[out] output The output tensor. Data types supported: U8. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined); + + // Inherited methods overridden: + BorderSize border_size() const override; +}; +} +#endif /*__ARM_COMPUTE_CLBOX3X3KERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLCannyEdgeKernel.h b/arm_compute/core/CL/kernels/CLCannyEdgeKernel.h new file mode 100644 index 0000000000..5ca3e03412 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLCannyEdgeKernel.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
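The border_undefined flag recurs throughout the filter kernels: true tells the kernel the caller will not read the border pixels that a full 3x3 neighbourhood cannot cover, while false promises the border was filled (replicate or constant) beforehand. A sketch:

    CLBox3x3Kernel box;
    box.configure(&src, &dst, true); // true: border output is left undefined
    // box.border_size() reports how many pixels around the image the kernel reads.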
+ */ +#ifndef __ARM_COMPUTE_CLCANNYEDGEKERNEL_H__ +#define __ARM_COMPUTE_CLCANNYEDGEKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +#include <cstdint> + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform Gradient computation. + */ +class CLGradientKernel : public ICLKernel +{ +public: + /** Constructor */ + CLGradientKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLGradientKernel(const CLGradientKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLGradientKernel &operator=(const CLGradientKernel &) = delete; + /** Initialise the kernel's sources, destinations and border mode. + * + * @note gx, gy and mag must all be the same size (either 16 or 32). + * + * @param[in] gx Source tensor - Gx component. Data types supported: S16/S32. + * @param[in] gy Source tensor - Gy component. Data types supported: Same as gx. + * @param[out] magnitude Destination tensor - Magnitude. Data types supported: U16/U32. Must match the pixel size of gx, gy. + * @param[out] phase Destination tensor - Quantized phase. Data types supported: U8. + * @param[in] norm_type Normalization type. If 1, L1-Norm, otherwise L2-Norm. + */ + void configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_gx; /**< Source tensor - Gx component */ + const ICLTensor *_gy; /**< Source tensor - Gy component */ + ICLTensor *_magnitude; /**< Destination tensor - Magnitude */ + ICLTensor *_phase; /**< Destination tensor - Quantized phase */ +}; + +/** OpenCL kernel to perform Non-Maxima suppression for Canny Edge. + * + * @note This kernel is meant to be used alongside CannyEdge and performs a non-maxima suppression using magnitude and phase of input + * to characterize points as possible edges. The output buffer needs to be cleared before this kernel is executed. + * + * @note Hysteresis is computed in @ref CLEdgeTraceKernel + */ +class CLEdgeNonMaxSuppressionKernel : public ICLKernel +{ +public: + /** Constructor */ + CLEdgeNonMaxSuppressionKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLEdgeNonMaxSuppressionKernel(const CLEdgeNonMaxSuppressionKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLEdgeNonMaxSuppressionKernel &operator=(const CLEdgeNonMaxSuppressionKernel &) = delete; + /** Initialise the kernel's sources, destination and border mode. + * + * @param[in] magnitude Source tensor - Magnitude. Data types supported: U16/U32. + * @param[in] phase Source tensor - Quantized phase. Data types supported: U8. + * @param[out] output Destination tensor. Data types supported: U16/U32. + * @param[in] lower_thr Lower threshold. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + BorderSize border_size() const override; + +private: + const ICLTensor *_magnitude; /**< Source tensor - Magnitude. */ + const ICLTensor *_phase; /**< Source tensor - Quantized phase.
*/ + ICLTensor *_output; /**< Destination tensor. */ +}; + +/** OpenCL kernel to perform Edge tracing. + */ +class CLEdgeTraceKernel : public ICLKernel +{ +public: + /** Constructor */ + CLEdgeTraceKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLEdgeTraceKernel(const CLEdgeTraceKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLEdgeTraceKernel &operator=(const CLEdgeTraceKernel &) = delete; + /** Initialise the kernel's source, destination and border mode. + * + * @param[in] input Source tensor. Data types supported: U8. + * @param[out] output Destination tensor. Data types supported: U8. + * @param[in] upper_thr Upper threshold used for the hysteresis. + * @param[in] lower_thr Lower threshold used for the hysteresis. + * @param[in,out] visited Tensor for keeping the visited pixels. Data types supported: U32. + * Expected to be initialized to 0 before each run. + * @param[in,out] recorded Tensor for keeping the recorded pixels. Data types supported: U32. + * Expected to be initialized to 0 before each run. + * @param[in,out] l1_stack Tensor with the L1 stack for each pixel. Data types supported: S32. + * Expected to be initialized to 0 before each run. + * @param[in,out] l1_stack_counter Tensor for counting the elements in the L1 stack of each pixel. Data types supported: U8. + * Expected to be initialized to 0 before each run. + */ + void configure(const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, + ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /**< Source tensor. */ + ICLTensor *_output; /**< Destination tensor. */ + int32_t _lower_thr; /**< Lower threshold used for the hysteresis. */ + int32_t _upper_thr; /**< Upper threshold used for the hysteresis. */ + ICLTensor *_visited; /**< Marks visited elements */ + ICLTensor *_recorded; /**< Marks recorded elements */ + ICLTensor *_l1_stack; /**< L1 hysteresis stack */ + ICLTensor *_l1_stack_counter; /**< L1 hysteresis stack counter */ +}; +} +#endif /* __ARM_COMPUTE_CLCANNYEDGEKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLChannelCombineKernel.h b/arm_compute/core/CL/kernels/CLChannelCombineKernel.h new file mode 100644 index 0000000000..3e718a2f1a --- /dev/null +++ b/arm_compute/core/CL/kernels/CLChannelCombineKernel.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
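Putting the three Canny stages together, the intended data flow is gradient -> non-maxima suppression -> edge trace. The sketch below shows only the wiring; tensor types follow each parameter list above, and the bookkeeping tensors must be zero-initialised before every run as noted:

    CLGradientKernel              gradient;
    CLEdgeNonMaxSuppressionKernel nonmax;
    CLEdgeTraceKernel             trace;

    gradient.configure(&gx, &gy, &mag, &phase, 1);                 // 1 selects the L1 norm
    nonmax.configure(&mag, &phase, &suppressed, lower_thr, false); // clear 'suppressed' beforehand
    trace.configure(&suppressed, &edges, upper_thr, lower_thr,
                    &visited, &recorded, &l1_stack, &l1_stack_counter);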
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H__ +#define __ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +#include <array> +#include <cstdint> + +namespace arm_compute +{ +class ICLMultiImage; +class ICLTensor; +using ICLImage = ICLTensor; + +/** Interface for the channel combine kernel */ +class CLChannelCombineKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLChannelCombineKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLChannelCombineKernel(const CLChannelCombineKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLChannelCombineKernel &operator=(const CLChannelCombineKernel &) = delete; + /** Allow instances of this class to be moved */ + CLChannelCombineKernel(CLChannelCombineKernel &&) = default; + /** Allow instances of this class to be moved */ + CLChannelCombineKernel &operator=(CLChannelCombineKernel &&) = default; + /** Default destructor */ + ~CLChannelCombineKernel() = default; + /** Configure function's inputs and outputs. + * + * @param[in] plane0 The 2D plane that forms channel 0. Must be of U8 format. + * @param[in] plane1 The 2D plane that forms channel 1. Must be of U8 format. + * @param[in] plane2 The 2D plane that forms channel 2. Must be of U8 format. + * @param[in] plane3 The 2D plane that forms channel 3. Must be of U8 format. + * @param[out] output The single planar output tensor. + */ + void configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output); + /** Configure function's inputs and outputs. + * + * @param[in] plane0 The 2D plane that forms channel 0. Must be of U8 format. + * @param[in] plane1 The 2D plane that forms channel 1. Must be of U8 format. + * @param[in] plane2 The 2D plane that forms channel 2. Must be of U8 format. + * @param[out] output The multi planar output tensor. + */ + void configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + std::array<const ICLTensor *, 4> _planes; + ICLTensor *_output; + ICLMultiImage *_output_multi; + std::array<uint32_t, 3> _x_subsampling; + std::array<uint32_t, 3> _y_subsampling; +}; +} +#endif /* __ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLChannelExtractKernel.h b/arm_compute/core/CL/kernels/CLChannelExtractKernel.h new file mode 100644 index 0000000000..3e9e699a50 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLChannelExtractKernel.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited.
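For example, recombining three U8 planes into a single interleaved RGB tensor with the first overload might look like this (a sketch; passing nullptr for the unused fourth plane of a three-channel format is an assumption, not something the header states):

    CLChannelCombineKernel combine;
    combine.configure(&r_plane, &g_plane, &b_plane, nullptr, &rgb); // 4th plane assumed unused for RGB
    combine.run(win, queue);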
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H__ +#define __ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/Types.h" + +#include <cstdint> + +namespace arm_compute +{ +class ICLMultiImage; +class ICLTensor; +using ICLImage = ICLTensor; + +/** Interface for the channel extract kernel */ +class CLChannelExtractKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLChannelExtractKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLChannelExtractKernel(const CLChannelExtractKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLChannelExtractKernel &operator=(const CLChannelExtractKernel &) = delete; + /** Allow instances of this class to be moved */ + CLChannelExtractKernel(CLChannelExtractKernel &&) = default; + /** Allow instances of this class to be moved */ + CLChannelExtractKernel &operator=(CLChannelExtractKernel &&) = default; + /** Default destructor */ + ~CLChannelExtractKernel() = default; + /** Set the input and output of the kernel + * + * @param[in] input Source tensor. + * @param[in] channel Channel to extract. + * @param[out] output Destination tensor. Must be of U8 format. + */ + void configure(const ICLTensor *input, Channel channel, ICLTensor *output); + /** Set the input and output of the kernel + * + * @param[in] input Multi-planar source image. + * @param[in] channel Channel to extract. + * @param[out] output Single-planar 2D destination image. Must be of U8 format. + */ + void configure(const ICLMultiImage *input, Channel channel, ICLImage *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; + uint32_t _num_elems_processed_per_iteration; + uint32_t _subsampling; +}; +} +#endif /* __ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLCol2ImKernel.h b/arm_compute/core/CL/kernels/CLCol2ImKernel.h new file mode 100644 index 0000000000..9d445e3004 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLCol2ImKernel.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLCOL2IMKERNEL_H__ +#define __ARM_COMPUTE_CLCOL2IMKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the col2im reshaping kernel. + * + * Rearranges each matrix column into image blocks. It's the inverse operation of @ref CLIm2ColKernel. + * + * For example, a vector of 9 elements can be reshaped to a block (image) of 3x3: + * + * @f[ + * \left( \begin{array}{ccccccccc} + * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\ + * \end{array} \right) + * \rightarrow + * \left( \begin{array}{ccc} + * a0 & a1 & a2 \\ + * a3 & a4 & a5 \\ + * a6 & a7 & a8 \\ + * \end{array} \right) + * @f] + */ +class CLCol2ImKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLCol2ImKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLCol2ImKernel(const CLCol2ImKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLCol2ImKernel &operator=(const CLCol2ImKernel &) = delete; + /** Allow instances of this class to be moved */ + CLCol2ImKernel(CLCol2ImKernel &&) = default; + /** Allow instances of this class to be moved */ + CLCol2ImKernel &operator=(CLCol2ImKernel &&) = default; + /** Default destructor */ + ~CLCol2ImKernel() = default; + + /** Set the input and output of the kernel. + * + * @param[in] input The input tensor to convert. Data types supported: F16, F32 + * @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM], + * while the rest represent batch of outputs. Data types supported: Same as @p input + * @param[in] convolved_dims Output convolved dimensions. + */ + void configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; + std::pair<unsigned int, unsigned int> _convolved_dims; +}; +} + +#endif /*__ARM_COMPUTE_CLCOL2IMKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLColorConvertKernel.h b/arm_compute/core/CL/kernels/CLColorConvertKernel.h new file mode 100644 index 0000000000..a88e2dcdf3 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLColorConvertKernel.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited.
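Matching the 9-to-3x3 example in the class comment, the column matrix coming out of a GEMM would be folded back into feature maps like so (a sketch; tensors assumed allocated with compatible shapes):

    CLCol2ImKernel col2im;
    // convolved_dims = { width, height } of the convolved output, here 3x3.
    col2im.configure(&gemm_out, &feature_maps, std::make_pair(3U, 3U));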
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLCOLORCONVERTKERNEL_H__ +#define __ARM_COMPUTE_CLCOLORCONVERTKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLMultiImage; +class ICLTensor; +using ICLImage = ICLTensor; + +/** Interface for the color convert kernel. + * + */ +class CLColorConvertKernel : public ICLKernel +{ +public: + /** Default constructor. */ + CLColorConvertKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLColorConvertKernel(const CLColorConvertKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLColorConvertKernel &operator=(const CLColorConvertKernel &) = delete; + /** Allow instances of this class to be moved */ + CLColorConvertKernel(CLColorConvertKernel &&) = default; + /** Allow instances of this class to be moved */ + CLColorConvertKernel &operator=(CLColorConvertKernel &&) = default; + /** Default destructor. 
*/ + ~CLColorConvertKernel() = default; + + /** Set the input and output of the kernel + * + * @param[in] input Source tensor + * @param[out] output Destination tensor + */ + void configure(const ICLTensor *input, ICLTensor *output); + /** Set the input and output of the kernel + * + * @param[in] input multi-planar source image + * @param[out] output single-planar destination image + */ + void configure(const ICLMultiImage *input, ICLImage *output); + /** Set the input and output of the kernel + * + * @param[in] input single-planar source image + * @param[out] output multi-planar destination image + */ + void configure(const ICLImage *input, ICLMultiImage *output); + /** Set the input and output of the kernel + * + * @param[in] input multi-planar source image + * @param[out] output multi-planar destination image + */ + void configure(const ICLMultiImage *input, ICLMultiImage *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /**< Pointer to single planar tensor input */ + ICLTensor *_output; /**< Pointer to single planar tensor output */ + const ICLMultiImage *_multi_input; /**< Pointer to multi-planar input */ + ICLMultiImage *_multi_output; /**< Pointer to multi-planar output */ +}; +} + +#endif /* __ARM_COMPUTE_CLCOLORCONVERTKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLConvolutionKernel.h b/arm_compute/core/CL/kernels/CLConvolutionKernel.h new file mode 100644 index 0000000000..9c0908405a --- /dev/null +++ b/arm_compute/core/CL/kernels/CLConvolutionKernel.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLCONVOLUTIONKERNEL_H__ +#define __ARM_COMPUTE_CLCONVOLUTIONKERNEL_H__ + +#include "arm_compute/core/CL/ICLSimple2DKernel.h" + +#include <cstdint> + +namespace arm_compute +{ +class ICLTensor; + +/****************************************************************************************\ + * Square Convolution * \****************************************************************************************/ + +/** Interface for the kernel to run an arbitrary size convolution on a tensor. (Currently supports 3x3, 5x5, 7x7 and 9x9). + * The client can supply a convolution matrix \f$ C_{m,n} \f$.
+ * @f{eqnarray}{ + * k_0 &=& \frac{m}{2} \\ + * l_0 &=& \frac{n}{2} \\ + * sum &=& \sum_{k=0,l=0}^{k=m-1,l=n-1} input(x+k-k_0, y+l-l_0) C_{k,l} + * @f} + * + * @note The above equation for this function is similar to the default OpenCV Filter2D function, + * which actually computes a correlation and not a convolution. + * In case of a real convolution the convolution matrix should be flipped both horizontally and vertically. + */ +template <unsigned int matrix_size> +class CLConvolutionKernel : public ICLSimple2DKernel +{ +public: + /** Initialise the kernel's input, output and border mode. + * + * @param[in] input Source tensor. Data types supported: U8. + * @param[out] output Destination tensor, Data types supported: U8, S16. + * @param[in] conv Convolution matrix to apply to the input tensor. + * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined); + + // Inherited methods overridden: + BorderSize border_size() const override; +}; + +/** Interface for the kernel which applies a 3x3 convolution to a tensor. */ +using CLConvolution3x3Kernel = CLConvolutionKernel<3>; +/** Interface for the kernel which applies a 5x5 convolution to a tensor. */ +using CLConvolution5x5Kernel = CLConvolutionKernel<5>; +/** Interface for the kernel which applies a 7x7 convolution to a tensor. */ +using CLConvolution7x7Kernel = CLConvolutionKernel<7>; +/** Interface for the kernel which applies a 9x9 convolution to a tensor. */ +using CLConvolution9x9Kernel = CLConvolutionKernel<9>; + +/****************************************************************************************\ + * Separable Square Convolution * \****************************************************************************************/ + +/** Kernel for the Horizontal pass of a Separable Convolution. Currently supports 5x5, 7x7, 9x9 */ +template <unsigned int matrix_size> +class CLSeparableConvolutionHorKernel : public ICLSimple2DKernel +{ +public: + /** Default Constructor */ + CLSeparableConvolutionHorKernel(); + /** Initialise the kernel's input, output and border mode. + * + * @param[in] input Source tensor. Data types supported: U8. + * @param[out] output Destination tensor, Data types supported: S16. + * @param[in] conv Convolution matrix to apply to the input tensor. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined); + + // Inherited methods overridden: + BorderSize border_size() const override; + +private: + BorderSize _border_size; /**< Border size */ +}; + +/** Interface for the kernel which applies a horizontal pass of 5x5 convolution to a tensor. */ +using CLSeparableConvolution5x5HorKernel = CLSeparableConvolutionHorKernel<5>; +/** Interface for the kernel which applies a horizontal pass of 7x7 convolution to a tensor. */ +using CLSeparableConvolution7x7HorKernel = CLSeparableConvolutionHorKernel<7>; +/** Interface for the kernel which applies a horizontal pass of 9x9 convolution to a tensor. */ +using CLSeparableConvolution9x9HorKernel = CLSeparableConvolutionHorKernel<9>; + +/** Kernel for the Vertical pass of a Separable Convolution.
+/** Kernel for the vertical pass of a separable convolution. Currently supports 5x5, 7x7 and 9x9. */
+template <unsigned int matrix_size>
+class CLSeparableConvolutionVertKernel : public ICLSimple2DKernel
+{
+public:
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input            Source tensor. Data types supported: S16.
+     * @param[out] output           Destination tensor. Data types supported: U8, S16.
+     * @param[in]  conv             Convolution matrix to apply to the input tensor.
+     * @param[in]  scale            Scale of the convolution matrix.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     * @param[in]  data_type        Data type to use for the intermediate result. @sa data_type_for_convolution
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type = DataType::S32);
+
+    // Inherited methods overridden:
+    BorderSize border_size() const override;
+};
+
+/** Interface for the kernel which applies a vertical pass of 5x5 convolution to a tensor. */
+using CLSeparableConvolution5x5VertKernel = CLSeparableConvolutionVertKernel<5>;
+/** Interface for the kernel which applies a vertical pass of 7x7 convolution to a tensor. */
+using CLSeparableConvolution7x7VertKernel = CLSeparableConvolutionVertKernel<7>;
+/** Interface for the kernel which applies a vertical pass of 9x9 convolution to a tensor. */
+using CLSeparableConvolution9x9VertKernel = CLSeparableConvolutionVertKernel<9>;
+
+/****************************************************************************************\
+ *                                 Rectangle Convolution                                *
+\****************************************************************************************/
+
+/** Kernel for running a convolution on a rectangular matrix.
+ *
+ * @note Supports matrix dimensions that are combinations of 3, 5, 7 and 9.
+ */
+class CLConvolutionRectangleKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLConvolutionRectangleKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLConvolutionRectangleKernel(const CLConvolutionRectangleKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLConvolutionRectangleKernel &operator=(const CLConvolutionRectangleKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLConvolutionRectangleKernel(CLConvolutionRectangleKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLConvolutionRectangleKernel &operator=(CLConvolutionRectangleKernel &&) = default;
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input            Source tensor. Data types supported: U8.
+     * @param[out] output           Destination tensor. Data types supported: U8, S16.
+     * @param[in]  conv             Convolution matrix to apply to the input tensor.
+     * @param[in]  width            Width of the convolution matrix (number of columns)
+     * @param[in]  height           Height of the convolution matrix (number of rows)
+     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
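+     *
+     * A minimal configuration sketch (illustrative only; it assumes already-allocated CL
+     * tensors src (U8) and dst (S16) and a hypothetical 5x3 averaging matrix):
+     * @code
+     * int16_t coeffs[15] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
+     * CLConvolutionRectangleKernel conv;
+     * conv.configure(&src, &dst, coeffs, 5, 3, 0, false); // width 5, height 3, scale 0 -> sum of coefficients (15)
+     * @endcode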
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    BorderSize       _border_size;
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+};
+}
+#endif /*__ARM_COMPUTE_CLCONVOLUTIONKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h b/arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h
new file mode 100644
index 0000000000..eda4c66883
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H__
+#define __ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the depth concatenate kernel.
+ *  The input tensor will be concatenated into the output tensor.
+ */
+class CLDepthConcatenateKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLDepthConcatenateKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLDepthConcatenateKernel(const CLDepthConcatenateKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLDepthConcatenateKernel &operator=(const CLDepthConcatenateKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLDepthConcatenateKernel(CLDepthConcatenateKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLDepthConcatenateKernel &operator=(CLDepthConcatenateKernel &&) = default;
+    /** Default destructor */
+    ~CLDepthConcatenateKernel() = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]     input        Input tensor. Data types supported: F32.
+     * @param[in]     depth_offset The offset on the Z axis.
+     * @param[in,out] output       Output tensor. Data types supported: F32.
+     *
+     * @note The two lowest dimensions of the output tensor can't be smaller than those of the input tensor.
+     * @note The gap between each of the two lowest dimensions of input and output needs to be divisible by 2.
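+     *
+     * A minimal usage sketch (illustrative only; it assumes already-allocated F32 CL
+     * tensors in0, in1 and out, where out is large enough to hold both inputs along Z):
+     * @code
+     * CLDepthConcatenateKernel concat0;
+     * CLDepthConcatenateKernel concat1;
+     * concat0.configure(&in0, 0, &out);                        // write in0 at depth 0
+     * concat1.configure(&in1, in0.info()->dimension(2), &out); // append in1 after in0
+     * @endcode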
+     *
+     */
+    void configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    int              _top_bottom;
+    int              _left_right;
+};
+}
+#endif /* __ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLDepthConvertKernel.h b/arm_compute/core/CL/kernels/CLDepthConvertKernel.h
new file mode 100644
index 0000000000..2c3b1b8b69
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLDepthConvertKernel.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H__
+#define __ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the depth conversion kernel. */
+class CLDepthConvertKernel : public ICLSimple2DKernel
+{
+public:
+    /** Set the input and output of the kernel.
+     *
+     * Valid conversions Input -> Output :
+     *
+     *   - U8  -> U16, S16, U32, S32
+     *   - U16 -> U8, U32, S32
+     *   - S16 -> U8, U32, S32
+     *   - U32 -> U8, U16, S16
+     *   - S32 -> U8, U16, S16
+     *
+     * @param[in]  input  The input tensor to convert. Data types supported: U8, U16, S16, U32 or S32.
+     * @param[out] output The output tensor. Data types supported: U8, U16, S16, U32 or S32.
+     * @param[in]  policy Conversion policy
+     * @param[in]  shift  Value for down/up conversions. Must be 0 <= shift < 8.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift);
+};
+}
+
+#endif /*__ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLDerivativeKernel.h b/arm_compute/core/CL/kernels/CLDerivativeKernel.h
new file mode 100644
index 0000000000..17552aefbe
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLDerivativeKernel.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLDERIVATIVEKERNEL_H__
+#define __ARM_COMPUTE_CLDERIVATIVEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the derivative kernel. */
+class CLDerivativeKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLDerivativeKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    CLDerivativeKernel(const CLDerivativeKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    CLDerivativeKernel &operator=(const CLDerivativeKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLDerivativeKernel(CLDerivativeKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLDerivativeKernel &operator=(CLDerivativeKernel &&) = default;
+    /** Default destructor */
+    ~CLDerivativeKernel() = default;
+    /** Initialise the kernel's sources, destination and border
+     *
+     * @note At least one of output_x or output_y must be set
+     *
+     * @param[in]  input            Source tensor. Data types supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data types supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data types supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    const ICLTensor *_input;            /**< Input tensor */
+    ICLTensor       *_output_x;         /**< Output tensor - Derivative along the X direction */
+    ICLTensor       *_output_y;         /**< Output tensor - Derivative along the Y direction */
+    bool             _run_derivative_x; /**< Do we need to run Derivative X? */
+    bool             _run_derivative_y; /**< Do we need to run Derivative Y? */
+};
+}
+#endif /*__ARM_COMPUTE_CLDERIVATIVEKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLDilateKernel.h b/arm_compute/core/CL/kernels/CLDilateKernel.h
new file mode 100644
index 0000000000..a5d3beb02f
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLDilateKernel.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLDILATEKERNEL_H__
+#define __ARM_COMPUTE_CLDILATEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the dilate kernel. */
+class CLDilateKernel : public ICLSimple2DKernel
+{
+public:
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input            An input tensor. Data types supported: U8.
+     * @param[out] output           The output tensor. Data types supported: U8.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    BorderSize border_size() const override;
+};
+}
+#endif /*__ARM_COMPUTE_CLDILATEKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLErodeKernel.h b/arm_compute/core/CL/kernels/CLErodeKernel.h
new file mode 100644
index 0000000000..a43c925be6
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLErodeKernel.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLERODEKERNEL_H__
+#define __ARM_COMPUTE_CLERODEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the erode kernel. */
+class CLErodeKernel : public ICLSimple2DKernel
+{
+public:
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input            An input tensor. Data types supported: U8.
+     * @param[out] output           The output tensor. Data types supported: U8.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    BorderSize border_size() const override;
+};
+}
+#endif /*__ARM_COMPUTE_CLERODEKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLFastCornersKernel.h b/arm_compute/core/CL/kernels/CLFastCornersKernel.h
new file mode 100644
index 0000000000..9817b78ae0
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLFastCornersKernel.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLFASTCORNERSKERNEL_H__
+#define __ARM_COMPUTE_CLFASTCORNERSKERNEL_H__
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace cl
+{
+class Buffer;
+}
+
+namespace arm_compute
+{
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** CL kernel to perform fast corners */
+class CLFastCornersKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLFastCornersKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFastCornersKernel(const CLFastCornersKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFastCornersKernel &operator=(const CLFastCornersKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLFastCornersKernel(CLFastCornersKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLFastCornersKernel &operator=(CLFastCornersKernel &&) = default;
+    /** Default destructor */
+    ~CLFastCornersKernel() = default;
+
+    /** Initialise the kernel.
+     *
+     * @param[in]  input  Source image. Data types supported: U8.
+     * @param[out] output Output image. Data types supported: U8.
+     * @param[in]  threshold           Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
+     * @param[in]  non_max_suppression True if non-maxima suppression is applied, false otherwise.
+     * @param[in]  border_mode         Strategy to use for borders.
+     */
+    void configure(const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode);
+
+    // Inherited methods overridden
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    const ICLImage *_input;
+    ICLImage       *_output;
+};
+
+/** CL kernel to copy keypoint information to an ICLKeyPointArray and count the number of key points */
+class CLCopyToArrayKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLCopyToArrayKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLCopyToArrayKernel(const CLCopyToArrayKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLCopyToArrayKernel &operator=(const CLCopyToArrayKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLCopyToArrayKernel(CLCopyToArrayKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLCopyToArrayKernel &operator=(CLCopyToArrayKernel &&) = default;
+    /** Default destructor */
+    ~CLCopyToArrayKernel() = default;
+
+    /** Initialise the kernel.
+     *
+     * @param[in]  input         Source image. Data types supported: U8.
+     * @param[in]  update_number Flag to indicate whether we need to update the number of corners
+     * @param[out] corners       Array of keypoints to store the results.
+     * @param[out] num_buffers   CL buffer where the number of detected keypoints is stored.
+     */
+    void configure(const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLImage   *_input;      /**< source image */
+    ICLKeyPointArray *_corners;    /**< destination array */
+    cl::Buffer       *_num_buffer; /**< CL memory to record number of key points in the array */
+};
+}
+#endif /* __ARM_COMPUTE_CLFASTCORNERSKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLFillBorderKernel.h b/arm_compute/core/CL/kernels/CLFillBorderKernel.h
new file mode 100644
index 0000000000..797f86dae8
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLFillBorderKernel.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLFILLBORDERKERNEL_H__
+#define __ARM_COMPUTE_CLFILLBORDERKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for filling the border of a kernel */
+class CLFillBorderKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLFillBorderKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFillBorderKernel(const CLFillBorderKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFillBorderKernel &operator=(const CLFillBorderKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLFillBorderKernel(CLFillBorderKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLFillBorderKernel &operator=(CLFillBorderKernel &&) = default;
+    /** Default destructor */
+    ~CLFillBorderKernel() = default;
+
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in,out] tensor                Tensor to process. Data types supported: U8, S16, S32, F32.
+     * @param[in]     border_size           Size of the border to fill in elements.
+     * @param[in]     border_mode           Border mode to use for the convolution.
+     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+
+    /** Function to set the constant value on fill border kernel depending on type.
+     *
+     * @param[in] idx                   Index of the kernel argument to set.
+     * @param[in] constant_border_value Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    template <class T>
+    void set_constant_border(unsigned int idx, const PixelValue &constant_border_value);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    bool is_parallelisable() const override;
+
+private:
+    ICLTensor *_tensor;
+};
+}
+#endif /*__ARM_COMPUTE_CLFILLBORDERKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
new file mode 100644
index 0000000000..3ac7b3c4fa
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMINTERLEAVE4X4KERNEL_H__
+#define __ARM_COMPUTE_CLGEMMINTERLEAVE4X4KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel which interleaves the elements of a matrix A in chunks of 4x4
+ *
+ * This function puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
+ *
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccccccccccc}
+ * a00 & a10 & a20 & a30 & a01 & a11 & a21 & a31 & a02 & a12 & a22 & a32 & a03 & a13 & a23 & a33 \\
+ * \end{array} \right)
+ * @f]
+ *
+ * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
+ */
+class CLGEMMInterleave4x4Kernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLGEMMInterleave4x4Kernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMInterleave4x4Kernel(const CLGEMMInterleave4x4Kernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMInterleave4x4Kernel &operator=(const CLGEMMInterleave4x4Kernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLGEMMInterleave4x4Kernel(CLGEMMInterleave4x4Kernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLGEMMInterleave4x4Kernel &operator=(CLGEMMInterleave4x4Kernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input  Input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32
+     * @param[out] output Output tensor. Data type supported: same as @p input
+     */
+    void configure(const ICLTensor *input, ICLTensor *output);
+
+    // Inherited methods overridden
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+};
+}
+#endif /* __ARM_COMPUTE_CLGEMMINTERLEAVE4X4KERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
new file mode 100644
index 0000000000..f84d0638da
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNEL_H__
+#define __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to compute a low precision matrix multiplication
+ *
+ * This kernel performs the following computation:
+ *  -# Convert the values of matrix A from uint8 to int32 and add a_offset to each of them.
+ *  -# Convert the values of matrix B from uint8 to int32 and add b_offset to each of them.
+ *  -# Compute the int32 matrix product of the resulting a * b.
+ *  -# Add output_offset to each entry of the result.
+ *  -# Multiply each entry of the result by output_mult_int and round to the nearest integer.
+ *  -# Clamp the resulting int32 values to the [0..255] range and cast to uint8.
+ */
+class CLGEMMLowpMatrixMultiplyKernel : public ICLKernel
+{
+public:
+    /** Default Constructor */
+    CLGEMMLowpMatrixMultiplyKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMLowpMatrixMultiplyKernel(const CLGEMMLowpMatrixMultiplyKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMLowpMatrixMultiplyKernel &operator=(const CLGEMMLowpMatrixMultiplyKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLGEMMLowpMatrixMultiplyKernel(CLGEMMLowpMatrixMultiplyKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLGEMMLowpMatrixMultiplyKernel &operator=(CLGEMMLowpMatrixMultiplyKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * The input matrices @p input0 and @p input1 must be the output of the kernels @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel.
+     * These two kernels change the layout of the original matrices to be more cache-friendly.
+     *
+     * @param[in]  input0          Input tensor containing the interleaved Matrix A. Data types supported: U8
+     * @param[in]  input1          Input tensor containing the transposed Matrix B. Data types supported: same as @p input0
+     * @param[out] output          Output tensor to store the result of matrix multiplication. Data types supported: same as @p input0
+     * @param[in]  a_offset        Offset to be added to each element of the matrix A.
+     * @param[in]  b_offset        Offset to be added to each element of the matrix B.
+     * @param[in]  output_offset   Offset to be added to each element of the output matrix
+     * @param[in]  output_mult_int Value to be multiplied with each element of the output matrix
+     * @param[in]  shift           Number of bits to shift right the result.
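+     *
+     * A minimal configuration sketch (illustrative only; it assumes U8 CL tensors
+     * a_interleaved, b_transposed and dst prepared as described above, and hypothetical
+     * quantisation parameters):
+     * @code
+     * CLGEMMLowpMatrixMultiplyKernel mm_lowp;
+     * mm_lowp.configure(&a_interleaved, &b_transposed, &dst, -128, -128, 128, 2, 8);
+     * @endcode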
+     */
+    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input0;
+    const ICLTensor *_input1;
+    ICLTensor       *_output;
+};
+}
+#endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNEL_H__*/
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h
new file mode 100644
index 0000000000..ea1db9f831
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMMATRIXACCUMULATEBIASESKERNEL_H__
+#define __ARM_COMPUTE_CLGEMMMATRIXACCUMULATEBIASESKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+/** Interface to add a bias to each row of the input tensor */
+class CLGEMMMatrixAccumulateBiasesKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLGEMMMatrixAccumulateBiasesKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMMatrixAccumulateBiasesKernel(const CLGEMMMatrixAccumulateBiasesKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMMatrixAccumulateBiasesKernel &operator=(const CLGEMMMatrixAccumulateBiasesKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLGEMMMatrixAccumulateBiasesKernel(CLGEMMMatrixAccumulateBiasesKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLGEMMMatrixAccumulateBiasesKernel &operator=(CLGEMMMatrixAccumulateBiasesKernel &&) = default;
+    /** Set the accumulate buffer and the biases of the kernel.
+     *
+     * @param[in, out] accum  The accumulate tensor to convert. Data types supported: F16/F32
+     * @param[in]      biases The shared biases tensor to append. It must be a 1D tensor. Data types supported: same as @p accum
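+     *
+     * A minimal usage sketch (illustrative only; it assumes already-allocated F32 CL
+     * tensors accum and biases, with biases a 1D tensor matching accum's row length):
+     * @code
+     * CLGEMMMatrixAccumulateBiasesKernel accumulate_biases;
+     * accumulate_biases.configure(&accum, &biases);
+     * @endcode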
+     */
+    void configure(ICLTensor *accum, const ICLTensor *biases);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    ICLTensor       *_accum;
+    const ICLTensor *_biases;
+};
+}
+
+#endif /*__ARM_COMPUTE_CLGEMMMATRIXACCUMULATEBIASESKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h
new file mode 100644
index 0000000000..c808039567
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMMATRIXADDITIONKERNEL_H__
+#define __ARM_COMPUTE_CLGEMMMATRIXADDITIONKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform the in-place matrix addition between 2 matrices, taking into account that the second matrix might be weighted by a scalar value beta.
+ *  The matrices must have the same dimensions.
+ *
+ * @note This kernel is computed if and only if beta != 0.0.
+ */
+class CLGEMMMatrixAdditionKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLGEMMMatrixAdditionKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMMatrixAdditionKernel(const CLGEMMMatrixAdditionKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMMatrixAdditionKernel &operator=(const CLGEMMMatrixAdditionKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLGEMMMatrixAdditionKernel(CLGEMMMatrixAdditionKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLGEMMMatrixAdditionKernel &operator=(CLGEMMMatrixAdditionKernel &&) = default;
+    /** Initialise the kernel's input, output and beta value
+     *
+     * @note The input and output tensors must have the same dimensions
+     *
+     * @param[in]      input  Input tensor (Matrix C). Data types supported: F16/F32
+     * @param[in, out] output Output tensor. If this kernel is used to finalize the GEMM result (alpha * AB + beta * C), output must contain the result obtained by @ref CLGEMMMatrixMultiplyKernel. Data type supported: same as @p input
+     * @param[in]      beta   Weight of matrix C
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, float beta);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+};
+}
+
+#endif /* __ARM_COMPUTE_CLGEMMMATRIXADDITIONKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
new file mode 100644
index 0000000000..07ea3c12ac
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H__
+#define __ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to multiply two input matrices "A" and "B" or to multiply a vector "A" by a matrix "B". All elements of the output matrix/vector will be multiplied by alpha
+ *
+ * @note If the output tensor is a matrix, the implementation assumes that the input tensors @p input0 and @p input1 are both matrices and reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
+ * @note If the output tensor is a vector and the data type is F32, the implementation assumes that the first input tensor @p input0 is a vector and the second input tensor @p input1 a matrix. The implementation also assumes that both tensors have not been reshaped
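+ *
+ * A minimal configuration sketch (illustrative only; it assumes F32 CL tensors
+ * a_interleaved and b_transposed prepared with the reshape kernels named above, and an
+ * output tensor dst of matching dimensions):
+ * @code
+ * CLGEMMMatrixMultiplyKernel mm;
+ * mm.configure(&a_interleaved, &b_transposed, &dst, 1.0f); // alpha = 1
+ * @endcode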
+ *
+ * @attention The second input tensor must have at least 2 dimensions (matrix)
+ */
+class CLGEMMMatrixMultiplyKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLGEMMMatrixMultiplyKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMMatrixMultiplyKernel(const CLGEMMMatrixMultiplyKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMMatrixMultiplyKernel &operator=(const CLGEMMMatrixMultiplyKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLGEMMMatrixMultiplyKernel(CLGEMMMatrixMultiplyKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLGEMMMatrixMultiplyKernel &operator=(CLGEMMMatrixMultiplyKernel &&) = default;
+    /** Initialise the kernel's input, output and alpha
+     *
+     * @param[in]  input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32
+     * @param[in]  input1 Input tensor containing the transposed Matrix B if the first input tensor A is not a vector.
+     *                    If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0
+     * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
+     * @param[in]  alpha  Weight of the matrix product
+     */
+    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input0;
+    const ICLTensor *_input1;
+    ICLTensor       *_output;
+};
+}
+#endif /* __ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
new file mode 100644
index 0000000000..8d44a4c4fa
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMTRANSPOSE1XWKERNEL_H__
+#define __ARM_COMPUTE_CLGEMMTRANSPOSE1XWKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel which transposes the elements of a matrix in chunks of 1x4 if the input data type is F32 or in chunks of 1x8 if the input data type is F16.
+ *
+ * The following is an example of how the transposition 1xW works when the input data type is F32:
+ *
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccccccccccc}
+ * a00 & a01 & a02 & a03 & a10 & a11 & a12 & a13 & a20 & a21 & a22 & a23 & a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * @f]
+ *
+ * The following is an example of how the transposition 1xW works when the input data type is F16:
+ *
+ * @f[
+ * \left( \begin{array}{cccccccc}
+ * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 \\
+ * a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 \\
+ * a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 \\
+ * a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc}
+ * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 & a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 & a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 & a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37\\
+ * \end{array} \right)
+ * @f]
+ *
+ * @note If the input data type is F32, the output matrix will have the following shape: [ height * 4, width / 4 ]
+ * @note If the input data type is F16, the output matrix will have the following shape: [ height * 8, width / 8 ]
+ * @note If the input data type is U8, the output matrix will have the following shape: [ height * 16, width / 16 ]
+ *
+ */
+class CLGEMMTranspose1xWKernel : public ICLSimple2DKernel
+{
+public:
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input  Input tensor. Data types supported: U8/F16/F32
+     * @param[out] output Output tensor. Data type supported: same as @p input
+     */
+    void configure(const ICLTensor *input, ICLTensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+};
+}
+#endif /* __ARM_COMPUTE_CLGEMMTRANSPOSE1XWKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h b/arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h
new file mode 100644
index 0000000000..028a10b421
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H__ +#define __ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H__ + +#include "arm_compute/core/CL/ICLSimple2DKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the Gaussian 3x3 filter kernel. + * + */ +class CLGaussian3x3Kernel : public ICLSimple2DKernel +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] input An input tensor. Data types supported: U8 + * @param[out] output The output tensor. Data types supported: U8. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined); + + // Inherited methods overridden: + BorderSize border_size() const override; +}; +} +#endif /*__ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h b/arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h new file mode 100644 index 0000000000..1484c06311 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H__ +#define __ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H__ + +#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the kernel to run the horizontal pass of 5x5 Gaussian filter on a tensor. */ +class CLGaussian5x5HorKernel : public CLSeparableConvolution5x5HorKernel +{ +public: + /** Initialise the kernel's source, destination and border. + * + * @param[in] input Source tensor. Data types supported: U8. + * @param[out] output Destination tensor. Data types supported: S16. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. 
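+     *
+     * A minimal usage sketch (illustrative only; it assumes already-allocated CL tensors
+     * src (U8) and tmp (S16); this shows only the horizontal pass, and the vertical kernel
+     * declared below then consumes tmp):
+     * @code
+     * CLGaussian5x5HorKernel gauss_hor;
+     * gauss_hor.configure(&src, &tmp, false);
+     * @endcode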
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
+
+private:
+    // Make the configure method of the parent class private
+    using CLSeparableConvolution5x5HorKernel::configure;
+};
+
+/** Interface for the kernel to run the vertical pass of 5x5 Gaussian filter on a tensor. */
+class CLGaussian5x5VertKernel : public CLSeparableConvolution5x5VertKernel
+{
+public:
+    /** Initialise the kernel's source, destination and border.
+     *
+     * @param[in]  input            Input tensor (output of the horizontal pass). Data types supported: S16.
+     * @param[out] output           Destination tensor. Data types supported: U8.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
+
+private:
+    // Make the configure method of the parent class private
+    using CLSeparableConvolution5x5VertKernel::configure;
+};
+}
+#endif /*__ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h b/arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h
new file mode 100644
index 0000000000..6d79d0e718
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#ifndef __ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H__ +#define __ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H__ + +#include "arm_compute/core/CL/ICLSimpleKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform a Gaussian filter and half scaling across width (horizontal pass) */ +class CLGaussianPyramidHorKernel : public ICLSimpleKernel +{ +public: + /** Default constructor */ + CLGaussianPyramidHorKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLGaussianPyramidHorKernel(const CLGaussianPyramidHorKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLGaussianPyramidHorKernel &operator=(const CLGaussianPyramidHorKernel &) = delete; + /** Allow instances of this class to be moved */ + CLGaussianPyramidHorKernel(CLGaussianPyramidHorKernel &&) = default; + /** Allow instances of this class to be moved */ + CLGaussianPyramidHorKernel &operator=(CLGaussianPyramidHorKernel &&) = default; + /** Default destructor */ + ~CLGaussianPyramidHorKernel() = default; + + /** Initialise the kernel's source, destination and border mode. + * + * @param[in] input Source tensor. Data types supported: U8. + * @param[out] output Destination tensor. Output should have half the input width. Data types supported: U16. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + BorderSize border_size() const override; + +private: + BorderSize _border_size; + int _l2_load_offset; +}; + +/** OpenCL kernel to perform a Gaussian filter and half scaling across height (vertical pass) */ +class CLGaussianPyramidVertKernel : public ICLSimpleKernel +{ +public: + /** Default constructor */ + CLGaussianPyramidVertKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLGaussianPyramidVertKernel(const CLGaussianPyramidVertKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLGaussianPyramidVertKernel &operator=(const CLGaussianPyramidVertKernel &) = delete; + /** Allow instances of this class to be moved */ + CLGaussianPyramidVertKernel(CLGaussianPyramidVertKernel &&) = default; + /** Allow instances of this class to be moved */ + CLGaussianPyramidVertKernel &operator=(CLGaussianPyramidVertKernel &&) = default; + /** Default destructor */ + ~CLGaussianPyramidVertKernel() = default; + + /** Initialise the kernel's source, destination and border mode. + * + * @param[in] input Source tensor. Data types supported: U16. + * @param[out] output Destination tensor. Output should have half the input height. Data types supported: U8. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. 
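+     *
+     * A minimal usage sketch (illustrative only; it assumes already-allocated CL tensors
+     * tmp (U16, output of the horizontal pass) and dst (U8, half the height of tmp)):
+     * @code
+     * CLGaussianPyramidVertKernel pyramid_vert;
+     * pyramid_vert.configure(&tmp, &dst, false);
+     * @endcode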
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    int _t2_load_offset;
+};
+}
+#endif /*__ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h b/arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h
new file mode 100644
index 0000000000..45a5aac1bc
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H__
+#define __ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/IHOG.h"
+#include "arm_compute/core/Size2D.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** OpenCL kernel to perform HOG Orientation Binning */
+class CLHOGOrientationBinningKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLHOGOrientationBinningKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGOrientationBinningKernel(const CLHOGOrientationBinningKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGOrientationBinningKernel &operator=(const CLHOGOrientationBinningKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLHOGOrientationBinningKernel(CLHOGOrientationBinningKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLHOGOrientationBinningKernel &operator=(CLHOGOrientationBinningKernel &&) = default;
+    /** Default destructor */
+    ~CLHOGOrientationBinningKernel() = default;
+
+    /** Initialise the kernel's inputs, output and HOG's metadata
+     *
+     * @param[in]  input_magnitude Input tensor which stores the magnitude of the gradient for each pixel. Data type supported: S16.
+     * @param[in]  input_phase     Input tensor which stores the phase of the gradient for each pixel. Data type supported: U8.
+     * @param[out] output          Output tensor which stores the local HOG for each cell. Data type supported: F32.
Number of channels supported: equal to the number of histogram bins per cell + * @param[in] hog_info HOG's metadata + */ + void configure(const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input_magnitude; + const ICLTensor *_input_phase; + ICLTensor *_output; + Size2D _cell_size; +}; + +/** OpenCL kernel to perform HOG block normalization */ +class CLHOGBlockNormalizationKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLHOGBlockNormalizationKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLHOGBlockNormalizationKernel(const CLHOGBlockNormalizationKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLHOGBlockNormalizationKernel &operator=(const CLHOGBlockNormalizationKernel &) = delete; + /** Allow instances of this class to be moved */ + CLHOGBlockNormalizationKernel(CLHOGBlockNormalizationKernel &&) = default; + /** Allow instances of this class to be moved */ + CLHOGBlockNormalizationKernel &operator=(CLHOGBlockNormalizationKernel &&) = default; + /** Default destructor */ + ~CLHOGBlockNormalizationKernel() = default; + + /** Initialise the kernel's input, output and HOG's metadata + * + * @param[in] input Input tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell + * @param[out] output Output tensor which stores the normalised blocks. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block + * @param[in] hog_info HOG's metadata + */ + void configure(const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; + Size2D _num_cells_per_block_stride; +}; +} +#endif /* __ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLHOGDetectorKernel.h b/arm_compute/core/CL/kernels/CLHOGDetectorKernel.h new file mode 100644 index 0000000000..47bd0549ee --- /dev/null +++ b/arm_compute/core/CL/kernels/CLHOGDetectorKernel.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHOGDETECTORKERNEL_H__
+#define __ARM_COMPUTE_CLHOGDETECTORKERNEL_H__
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLHOG.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/OpenCL.h"
+
+namespace cl
+{
+class Buffer;
+}
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform HOG detection using linear SVM */
+class CLHOGDetectorKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLHOGDetectorKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGDetectorKernel(const CLHOGDetectorKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGDetectorKernel &operator=(const CLHOGDetectorKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLHOGDetectorKernel(CLHOGDetectorKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLHOGDetectorKernel &operator=(CLHOGDetectorKernel &&) = default;
+    /** Default destructor */
+    ~CLHOGDetectorKernel() = default;
+
+    /** Initialise the kernel's input, HOG data-object, detection window, the stride of the detection window, the threshold and index of the object to detect
+     *
+     * @param[in]  input                   Input tensor which stores the HOG descriptor obtained with @ref CLHOGOrientationBinningKernel. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
+     * @param[in]  hog                     HOG data object used by @ref CLHOGOrientationBinningKernel and @ref CLHOGBlockNormalizationKernel
+     * @param[out] detection_windows       Array of @ref DetectionWindow. This array stores all the detected objects
+     * @param[out] num_detection_windows   Number of detected objects
+     * @param[in]  detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
+     *                                     It must be a multiple of the hog->info()->block_stride()
+     * @param[in]  threshold               (Optional) Threshold for the distance between features and SVM classifying plane
+     * @param[in]  idx_class               (Optional) Index of the class used for evaluating which class the detection window belongs to
+     */
+    void configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f,
+                   uint16_t idx_class = 0);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor         *_input;
+    ICLDetectionWindowArray *_detection_windows;
+    cl::Buffer              *_num_detection_windows;
+};
+}
+
+#endif /* __ARM_COMPUTE_CLHOGDETECTORKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLHarrisCornersKernel.h b/arm_compute/core/CL/kernels/CLHarrisCornersKernel.h
new file mode 100644
index 0000000000..d8057df8d1
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLHarrisCornersKernel.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHARRISCORNERSKERNEL_H__
+#define __ARM_COMPUTE_CLHARRISCORNERSKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Interface for the Harris score kernel.
+ *
+ * @note The implementation supports 3, 5, and 7 for the block_size.
+ */
+class CLHarrisScoreKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLHarrisScoreKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHarrisScoreKernel(const CLHarrisScoreKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHarrisScoreKernel &operator=(const CLHarrisScoreKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLHarrisScoreKernel(CLHarrisScoreKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLHarrisScoreKernel &operator=(CLHarrisScoreKernel &&) = default;
+    /** Default destructor */
+    ~CLHarrisScoreKernel() = default;
+
+    /** Setup the kernel parameters
+     *
+     * @param[in]  input1           Source image (gradient X). Data types supported: S16, S32. (Must be the same as input2)
+     * @param[in]  input2           Source image (gradient Y). Data types supported: S16, S32. (Must be the same as input1)
+     * @param[out] output           Destination image (Harris score). Data types supported: F32
+     * @param[in]  block_size       The block window size used to compute the Harris Corner score. Supports: 3, 5 and 7
+     * @param[in]  norm_factor      Normalization factor to use accordingly with the gradient size (Must be different from 0)
+     * @param[in]  strength_thresh  Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
+     * @param[in]  sensitivity      Sensitivity threshold k from the Harris-Stephens equation.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
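+     *
+     * Illustrative configuration sketch (the gx/gy gradient images, the score
+     * image and the numeric arguments are assumptions, not prescribed values):
+     * @code
+     * CLHarrisScoreKernel harris_score;
+     * // gx/gy: S16 gradient images; score: F32; norm_factor and strength_thresh are illustrative floats
+     * harris_score.configure(&gx, &gy, &score, 3, norm_factor, strength_thresh, 0.04f, false);
+     * @endcode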
+     */
+    void configure(const ICLImage *input1, const ICLImage *input2, ICLImage *output,
+                   int32_t block_size, float norm_factor, float strength_thresh, float sensitivity,
+                   bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+protected:
+    const ICLImage *_input1;          /**< Source image - Gx component */
+    const ICLImage *_input2;          /**< Source image - Gy component */
+    ICLImage       *_output;          /**< Output image - Harris score */
+    float           _sensitivity;     /**< Sensitivity value */
+    float           _strength_thresh; /**< Threshold value */
+    float           _norm_factor;     /**< Normalization factor */
+    BorderSize      _border_size;     /**< Border size */
+};
+}
+#endif /* __ARM_COMPUTE_CLHARRISCORNERSKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLHistogramKernel.h b/arm_compute/core/CL/kernels/CLHistogramKernel.h
new file mode 100644
index 0000000000..b65e62d9a2
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLHistogramKernel.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHISTOGRAMKERNEL_H__
+#define __ARM_COMPUTE_CLHISTOGRAMKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLDistribution1D;
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Interface to run the histogram kernel. This kernel processes the part of the image whose width is a multiple of 16.
+ * If the image width is not a multiple of 16, the remaining pixels have to be processed with the @ref CLHistogramBorderKernel
+ */
+class CLHistogramKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLHistogramKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHistogramKernel(const CLHistogramKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHistogramKernel &operator=(const CLHistogramKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLHistogramKernel(CLHistogramKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLHistogramKernel &operator=(CLHistogramKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input  Source image. Data types supported: U8.
+     * @param[out] output Destination distribution.
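+     *
+     * Illustrative sketch covering the full image (the image and distribution
+     * objects are assumptions, created and allocated by the caller):
+     * @code
+     * CLHistogramKernel       hist;
+     * CLHistogramBorderKernel hist_border; // processes the width % 16 leftover pixels
+     * hist.configure(&image, &distribution);
+     * hist_border.configure(&image, &distribution);
+     * @endcode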
+     */
+    void configure(const ICLImage *input, ICLDistribution1D *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLImage    *_input;
+    ICLDistribution1D *_output;
+};
+
+/** Interface to run the histogram kernel to handle the leftover part of the image
+ *
+ */
+class CLHistogramBorderKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLHistogramBorderKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHistogramBorderKernel(const CLHistogramBorderKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHistogramBorderKernel &operator=(const CLHistogramBorderKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLHistogramBorderKernel(CLHistogramBorderKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLHistogramBorderKernel &operator=(CLHistogramBorderKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input  Source image. Data types supported: U8.
+     * @param[out] output Destination distribution.
+     */
+    void configure(const ICLImage *input, ICLDistribution1D *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLImage    *_input;
+    ICLDistribution1D *_output;
+};
+}
+
+#endif /* __ARM_COMPUTE_CLHISTOGRAMKERNEL_H__*/
diff --git a/arm_compute/core/CL/kernels/CLIm2ColKernel.h b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
new file mode 100644
index 0000000000..d2224b53e1
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLIM2COLKERNEL_H__
+#define __ARM_COMPUTE_CLIM2COLKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the im2col reshape kernel.
+ *
+ * Rearranges image blocks into columns. It is used to strip out each convolution block to a single column.
+ * It is used to transform a convolution to a plain matrix multiplication.
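+ *
+ * A typical configure() call looks as follows (a sketch only: the tensor names,
+ * shapes and convolved output dimensions are assumptions); the underlying block
+ * rearrangement is illustrated by the example right after this snippet:
+ * @code
+ * CLIm2ColKernel im2col;
+ * // input: [W, H, IFM]; out_w/out_h: unsigned int convolved dims; 3x3 kernel, stride 1, no padding, no bias
+ * im2col.configure(&input, &output, std::make_pair(out_w, out_h), PadStrideInfo(1, 1, 0, 0), false);
+ * @endcode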
+ *
+ * For example, taking into account the image below and assuming 3x3 image blocks with a stride of 1, we have:
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * =
+ * \left( \begin{array}{ccccccccc}
+ * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\
+ * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\
+ * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\
+ * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * @f]
+ */
+class CLIm2ColKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLIm2ColKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLIm2ColKernel(const CLIm2ColKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLIm2ColKernel &operator=(const CLIm2ColKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLIm2ColKernel(CLIm2ColKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLIm2ColKernel &operator=(CLIm2ColKernel &&) = default;
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  input          The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
+     *                            while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16, F32
+     * @param[out] output         The output tensor. First 2 lower dimensions represent a transform of each 3D input,
+     *                            while every dimension above represents a batch. Data types supported: Same as @p input
+     * @param[in]  convolved_dims The convolved output dimensions.
+     * @param[in]  conv_info      Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  has_bias       If biases are provided, expands each output column with an additional element set to 1.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims, const PadStrideInfo &conv_info, bool has_bias);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    /** Run the reshape kernel optimised for the special case (stride is 1, padding is 0 and the kernel's low 3 dimensions are the same as the input's)
+     *
+     * @param[in]     window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     * @param[in,out] queue  Command queue on which to enqueue the kernel.
+     */
+    void run_reduced(const Window &window, cl::CommandQueue &queue);
+    /** Run the generic convolution layer input reshape kernel
+     *
+     * @param[in]     window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     * @param[in,out] queue  Command queue on which to enqueue the kernel.
+     */
+    void run_generic(const Window &window, cl::CommandQueue &queue);
+
+    /** Common signature for the kernel to run */
+    using Im2ColFunction = void (CLIm2ColKernel::*)(const Window &, cl::CommandQueue &);
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    std::pair<unsigned int, unsigned int> _convolved_dims;
+    PadStrideInfo   _conv_info;
+    int             _kernel_size;
+    unsigned int    _num_elems_processed_per_iteration;
+    Im2ColFunction  _run_func;
+};
+}
+
+#endif /*__ARM_COMPUTE_CLIM2COLKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLIntegralImageKernel.h b/arm_compute/core/CL/kernels/CLIntegralImageKernel.h
new file mode 100644
index 0000000000..0f53c2d2a8
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLIntegralImageKernel.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H__
+#define __ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface to run the horizontal pass of the integral image kernel. */
+class CLIntegralImageHorKernel : public ICLSimple2DKernel
+{
+public:
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input  An input tensor. Data types supported: U8
+     * @param[out] output Destination tensor. Data types supported: U32.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output);
+};
+
+/** Interface to run the vertical pass of the integral image kernel. */
+class CLIntegralImageVertKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLIntegralImageVertKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    CLIntegralImageVertKernel(const CLIntegralImageVertKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    CLIntegralImageVertKernel &operator=(const CLIntegralImageVertKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLIntegralImageVertKernel(CLIntegralImageVertKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLIntegralImageVertKernel &operator=(CLIntegralImageVertKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in,out] in_out The input/output tensor. Data types supported: U32
+     */
+    void configure(ICLTensor *in_out);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    ICLTensor *_in_out;
+};
+}
+#endif /*__ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLLKTrackerKernel.h b/arm_compute/core/CL/kernels/CLLKTrackerKernel.h
new file mode 100644
index 0000000000..4d0dbed55d
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLLKTrackerKernel.h
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLLKTRACKERKERNEL_H__
+#define __ARM_COMPUTE_CLLKTRACKERKERNEL_H__
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Internal keypoint structure for Lucas-Kanade Optical Flow */
+struct CLLKInternalKeypoint
+{
+    float x{ 0.f };               /**< x coordinate of the keypoint */
+    float y{ 0.f };               /**< y coordinate of the keypoint */
+    float tracking_status{ 0.f }; /**< the tracking status of the keypoint */
+    float dummy{ 0.f };           /**< Dummy field, to make sure the data structure is 128-bit aligned, so that the GPU can use vload4 */
+};
+
+/** Structure for storing Spatial Gradient Matrix and the minimum eigenvalue for each keypoint */
+struct CLCoefficientTable
+{
+    float A11;     /**< iA11 * FLT_SCALE */
+    float A12;     /**< iA12 * FLT_SCALE */
+    float A22;     /**< iA22 * FLT_SCALE */
+    float min_eig; /**< Minimum eigenvalue */
+};
+
+/** Structure for storing ival, ixval and iyval for each point inside the window */
+struct CLOldValue
+{
+    int16_t ival;  /**< ival extracted from the old image */
+    int16_t ixval; /**< ixval extracted from the Scharr Gx image */
+    int16_t iyval; /**< iyval extracted from the Scharr Gy image */
+    int16_t dummy; /**< Dummy field, to make sure the data structure is 128-bit aligned, so that the GPU can use vload4 */
+};
+
+using ICLLKInternalKeypointArray = ICLArray<CLLKInternalKeypoint>;
+using ICLCoefficientTableArray   = ICLArray<CLCoefficientTable>;
+using ICLOldValArray             = ICLArray<CLOldValue>;
+
+/** Interface to run the initialization step of LKTracker */
+class CLLKTrackerInitKernel : public ICLKernel
+{
+public:
+    /** Initialise the kernel input and output
+     *
+     * @param[in]  old_points           Pointer to the @ref ICLKeyPointArray storing old key points
+     * @param[in]  new_points_estimates Pointer to the @ref ICLKeyPointArray storing the new estimated
key points + * @param[out] old_points_internal Pointer to the array of internal @ref CLLKInternalKeypoint old points + * @param[out] new_points_internal Pointer to the array of internal @ref CLLKInternalKeypoint new points + * @param[in] use_initial_estimate The flag to indicate whether the initial estimated position should be used + * @param[in] level The pyramid level + * @param[in] num_levels The number of pyramid levels + * @param[in] pyramid_scale Scale factor used for generating the pyramid + */ + void configure(const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, + ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal, + bool use_initial_estimate, size_t level, size_t num_levels, float pyramid_scale); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; +}; + +/** Interface to run the finalize step of LKTracker, where it truncates the coordinates stored in new_points array */ +class CLLKTrackerFinalizeKernel : public ICLKernel +{ +public: + /** Initialise the kernel input and output + * + * @param[in] new_points_internal Pointer to the array of internal @ref CLLKInternalKeypoint new points + * @param[out] new_points Pointer to the @ref ICLKeyPointArray storing new key points + */ + void configure(ICLLKInternalKeypointArray *new_points_internal, ICLKeyPointArray *new_points); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; +}; + +/** Interface to run the first stage of LKTracker, where A11, A12, A22, min_eig, ival, ixval and iyval are computed */ +class CLLKTrackerStage0Kernel : public ICLKernel +{ +public: + /** Default constructor */ + CLLKTrackerStage0Kernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLLKTrackerStage0Kernel(const CLLKTrackerStage0Kernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLLKTrackerStage0Kernel &operator=(const CLLKTrackerStage0Kernel &) = delete; + /** Allow instances of this class to be moved */ + CLLKTrackerStage0Kernel(CLLKTrackerStage0Kernel &&) = default; + /** Allow instances of this class to be moved */ + CLLKTrackerStage0Kernel &operator=(CLLKTrackerStage0Kernel &&) = default; + /** Initialise the kernel input and output + * + * @param[in] old_input Pointer to the input old tensor. Data types supported: U8 + * @param[in] old_scharr_gx Pointer to the input scharr X tensor. Data types supported: S16 + * @param[in] old_scharr_gy Pointer to the input scharr Y tensor. 
Data types supported: S16 + * @param[in] old_points_internal Pointer to the array of CLLKInternalKeypoint old points + * @param[in, out] new_points_internal Pointer to the array of CLLKInternalKeypoint new points + * @param[out] coeff_table Pointer to the array holding the Spatial Gradient coefficients + * @param[out] old_ival Pointer to the array holding internal values + * @param[in] window_dimension The size of the window on which to perform the algorithm + * @param[in] level The pyramid level + */ + void configure(const ICLTensor *old_input, const ICLTensor *old_scharr_gx, const ICLTensor *old_scharr_gy, + ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal, + ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival, + size_t window_dimension, size_t level); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_old_input; + const ICLTensor *_old_scharr_gx; + const ICLTensor *_old_scharr_gy; +}; + +/** Interface to run the second stage of LKTracker, where the motion vectors of the given points are computed */ +class CLLKTrackerStage1Kernel : public ICLKernel +{ +public: + /** Default constructor */ + CLLKTrackerStage1Kernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLLKTrackerStage1Kernel(const CLLKTrackerStage1Kernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLLKTrackerStage1Kernel &operator=(const CLLKTrackerStage1Kernel &) = delete; + /** Allow instances of this class to be moved */ + CLLKTrackerStage1Kernel(CLLKTrackerStage1Kernel &&) = default; + /** Allow instances of this class to be moved */ + CLLKTrackerStage1Kernel &operator=(CLLKTrackerStage1Kernel &&) = default; + /** Initialise the kernel input and output + * + * @param[in] new_input Pointer to the input new tensor. Data types supported: U8 + * @param[in, out] new_points_internal Pointer to the array of CLLKInternalKeypoint for new points + * @param[in] coeff_table Pointer to the array holding the Spatial Gradient coefficients + * @param[in] old_ival Pointer to the array holding internal values + * @param[in] termination The criteria to terminate the search of each keypoint. + * @param[in] epsilon The error for terminating the algorithm + * @param[in] num_iterations The maximum number of iterations before terminating the algorithm + * @param[in] window_dimension The size of the window on which to perform the algorithm + * @param[in] level The pyramid level + */ + void configure(const ICLTensor *new_input, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival, + Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_new_input; +}; +} +#endif /*__ARM_COMPUTE_CLLKTRACKERKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h new file mode 100644 index 0000000000..fda0327461 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLLOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H__
+#define __ARM_COMPUTE_CLLOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to multiply each row of the first tensor with the low 2 dimensions of the second tensor.
+ *
+ * @attention The second input tensor must have at least 2 dimensions (matrix)
+ *
+ */
+class CLLocallyConnectedMatrixMultiplyKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLLocallyConnectedMatrixMultiplyKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLocallyConnectedMatrixMultiplyKernel(const CLLocallyConnectedMatrixMultiplyKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLocallyConnectedMatrixMultiplyKernel &operator=(const CLLocallyConnectedMatrixMultiplyKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLLocallyConnectedMatrixMultiplyKernel(CLLocallyConnectedMatrixMultiplyKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLLocallyConnectedMatrixMultiplyKernel &operator=(CLLocallyConnectedMatrixMultiplyKernel &&) = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]  input0 First input tensor. Data types supported: F32
+     * @param[in]  input1 Second input tensor. Data type supported: same as @p input0
+     * @param[out] output Output tensor to store the result. Data type supported: same as @p input0
+     */
+    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input0;
+    const ICLTensor *_input1;
+    ICLTensor       *_output;
+};
+}
+#endif /* __ARM_COMPUTE_CLLOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h b/arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h
new file mode 100644
index 0000000000..a8e1dcb361
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H__
+#define __ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to compute magnitude and phase.
+ *
+ */
+class CLMagnitudePhaseKernel : public ICLKernel
+{
+public:
+    /** Default constructor. */
+    CLMagnitudePhaseKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    CLMagnitudePhaseKernel(const CLMagnitudePhaseKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    CLMagnitudePhaseKernel &operator=(const CLMagnitudePhaseKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLMagnitudePhaseKernel(CLMagnitudePhaseKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLMagnitudePhaseKernel &operator=(CLMagnitudePhaseKernel &&) = default;
+    /** Initialise the kernel's inputs and outputs.
+     *
+     * @note At least one of magnitude or phase must be set.
+     *
+     * @param[in]  gx         The input gradient X tensor. Data types supported: S16.
+     * @param[in]  gy         The input gradient Y tensor. Data types supported: S16.
+     * @param[out] magnitude  (Optional) The output tensor - Magnitude. Data types supported: S16.
+     * @param[out] phase      (Optional) The output tensor - Phase. Data types supported: U8.
+     * @param[in]  mag_type   (Optional) Magnitude calculation type. Default: L2NORM.
+     * @param[in]  phase_type (Optional) Phase calculation type. Default: SIGNED.
+     */
+    void configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase,
+                   MagnitudeType mag_type = MagnitudeType::L2NORM, PhaseType phase_type = PhaseType::SIGNED);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_gx;        /**< Input gradient X. */
+    const ICLTensor *_gy;        /**< Input gradient Y. */
+    ICLTensor       *_magnitude; /**< Output - Magnitude. */
+    ICLTensor       *_phase;     /**< Output - Phase. */
+    bool             _run_mag;   /**< Calculate magnitude? */
+    bool             _run_phase; /**< Calculate phase? */
+};
+}
+
+#endif /* __ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLMeanStdDevKernel.h b/arm_compute/core/CL/kernels/CLMeanStdDevKernel.h
new file mode 100644
index 0000000000..9f30f76e1b
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLMeanStdDevKernel.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLMEANSTDDEVKERNEL_H__
+#define __ARM_COMPUTE_CLMEANSTDDEVKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace cl
+{
+class Buffer;
+}
+
+namespace arm_compute
+{
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Interface for the kernel to calculate mean and standard deviation of input image pixels. */
+class CLMeanStdDevKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLMeanStdDevKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMeanStdDevKernel(const CLMeanStdDevKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMeanStdDevKernel &operator=(const CLMeanStdDevKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLMeanStdDevKernel(CLMeanStdDevKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLMeanStdDevKernel &operator=(CLMeanStdDevKernel &&) = default;
+    /** Initialise the kernel's input and outputs.
+     *
+     * @param[in]  input              Input image. Data types supported: U8.
+     * @param[out] mean               Output average pixel value.
+     * @param[out] global_sum         Keeps global sum of pixel values (Buffer size: 1 cl_ulong).
+     * @param[out] stddev             (Optional) Output standard deviation of pixel values.
+     * @param[out] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values (Buffer size: 1 cl_ulong).
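+     *
+     * Illustrative sketch (the image and the two single-cl_ulong buffers are
+     * assumptions, created and allocated by the caller):
+     * @code
+     * CLMeanStdDevKernel mean_stddev;
+     * float mean = 0.f, stddev = 0.f;
+     * mean_stddev.configure(&image, &mean, &global_sum, &stddev, &global_sum_squared);
+     * @endcode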
+ */ + void configure(const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev = nullptr, cl::Buffer *global_sum_squared = nullptr); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLImage *_input; + float *_mean; + float *_stddev; + cl::Buffer *_global_sum; + cl::Buffer *_global_sum_squared; +}; +} +#endif /* __ARM_COMPUTE_CLMEANSTDDEVKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLMedian3x3Kernel.h b/arm_compute/core/CL/kernels/CLMedian3x3Kernel.h new file mode 100644 index 0000000000..5af364b6c6 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLMedian3x3Kernel.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLMEDIAN3X3KERNEL_H__ +#define __ARM_COMPUTE_CLMEDIAN3X3KERNEL_H__ + +#include "arm_compute/core/CL/ICLSimple2DKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the median 3x3 filter kernel. + * + */ +class CLMedian3x3Kernel : public ICLSimple2DKernel +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] input An input tensor. Data types supported: U8 + * @param[out] output The output tensor. Data types supported: U8. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined); + + // Inherited methods overridden: + BorderSize border_size() const override; +}; +} +#endif /*__ARM_COMPUTE_CLMEDIAN3X3KERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h b/arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h new file mode 100644 index 0000000000..6a31f3cf18 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLMINMAXLOCATIONKERNEL_H__
+#define __ARM_COMPUTE_CLMINMAXLOCATIONKERNEL_H__
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+
+#include <array>
+
+namespace arm_compute
+{
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Interface for the kernel to perform min max search on an image.
+ */
+class CLMinMaxKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLMinMaxKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMinMaxKernel(const CLMinMaxKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMinMaxKernel &operator=(const CLMinMaxKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLMinMaxKernel(CLMinMaxKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLMinMaxKernel &operator=(CLMinMaxKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input   Input Image. Data types supported: U8 or S16.
+     * @param[out] min_max Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32.
+     */
+    void configure(const ICLImage *input, cl::Buffer *min_max);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor   *_input;             /**< Input image. */
+    cl::Buffer        *_min_max;           /**< Minimum/maximum value. */
+    std::array<int, 2> _data_type_max_min; /**< Maximum and minimum data type value respectively. */
+};
+
+/** Interface for the kernel to find min max locations of an image.
+ */
+class CLMinMaxLocationKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLMinMaxLocationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMinMaxLocationKernel(const CLMinMaxLocationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMinMaxLocationKernel &operator=(const CLMinMaxLocationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLMinMaxLocationKernel(CLMinMaxLocationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLMinMaxLocationKernel &operator=(CLMinMaxLocationKernel &&) = default;
+    /** Initialise the kernel's input and outputs.
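+     *
+     * Illustrative sketch of the two-kernel sequence (the buffers and coordinate
+     * arrays are assumptions; min_max_buf must already hold the result of
+     * @ref CLMinMaxKernel when this kernel runs):
+     * @code
+     * CLMinMaxKernel         min_max;
+     * CLMinMaxLocationKernel min_max_loc;
+     * min_max.configure(&image, &min_max_buf);
+     * min_max_loc.configure(&image, &min_max_buf, &min_max_count_buf, &min_coords, &max_coords);
+     * @endcode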
+     *
+     * @note When locations of min and max occurrences are requested, the reported number of locations is limited to the given array size.
+     *
+     * @param[in]  input         Input image. Data types supported: U8 or S16.
+     * @param[in]  min_max       Buffer of 2 elements which contains the min value at position 0 and the max value at position 1. Data type supported: S32
+     * @param[out] min_max_count Buffer of 2 elements to store the min value occurrences at position 0 and the max value occurrences at position 1. Data type supported: S32
+     * @param[out] min_loc       (Optional) Array of Coordinates2D used to store minimum value locations.
+     * @param[out] max_loc       (Optional) Array of Coordinates2D used to store maximum value locations.
+     */
+    void configure(const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count,
+                   ICLCoordinates2DArray *min_loc = nullptr, ICLCoordinates2DArray *max_loc = nullptr);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLImage *_input;         /**< Input image. */
+    cl::Buffer     *_min_max_count; /**< Minimum/maximum value occurrences. */
+};
+}
+#endif /*__ARM_COMPUTE_CLMINMAXLOCATIONKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h b/arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h
new file mode 100644
index 0000000000..0c59063bbc
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H__
+#define __ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to apply a non-linear filter */
+class CLNonLinearFilterKernel : public ICLSimple2DKernel
+{
+public:
+    /** Default constructor */
+    CLNonLinearFilterKernel();
+    /** Set the source, destination and border mode of the kernel
+     *
+     * @param[in]  input     Source tensor. Data types supported: U8
+     * @param[out] output    Destination tensor. Data types supported: U8
+     * @param[in]  function  Non linear function to perform
+     * @param[in]  mask_size Mask size. Supported sizes: 3, 5
+     * @param[in]  pattern   Mask pattern
+     * @param[in]  mask      The given mask.
Will be used only if pattern is specified to PATTERN_OTHER + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, + unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, + bool border_undefined); + + // Inherited methods overridden: + BorderSize border_size() const override; + +private: + BorderSize _border_size; /**< Border size */ +}; +} +#endif /*__ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h b/arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h new file mode 100644 index 0000000000..1719bbbb47 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H__ +#define __ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H__ + +#include "arm_compute/core/CL/ICLSimple2DKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface to perform Non-Maxima suppression over a 3x3 window using OpenCL + * + * @note Used by @ref CLFastCorners and @ref CLHarrisCorners + */ +class CLNonMaximaSuppression3x3Kernel : public ICLSimple2DKernel +{ +public: + /** Initialise the kernel's sources, destinations and border mode. + * + * @param[in] input Source tensor. Data types supported: U8, F32. (Must be the same as the output tensor) + * @param[out] output Destination tensor. Data types supported: U8, F32. (Must be the same as the input tensor) + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined); + + // Inherited methods overridden: + BorderSize border_size() const override; +}; +} +#endif /* __ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h b/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h new file mode 100644 index 0000000000..ca9034b162 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLNORMALIZATIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_CLNORMALIZATIONLAYERKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the normalization layer kernel.
+ */
+class CLNormalizationLayerKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLNormalizationLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLNormalizationLayerKernel(const CLNormalizationLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLNormalizationLayerKernel &operator=(const CLNormalizationLayerKernel &) = delete;
+ /** Default move constructor. */
+ CLNormalizationLayerKernel(CLNormalizationLayerKernel &&) = default;
+ /** Default move assignment operator. */
+ CLNormalizationLayerKernel &operator=(CLNormalizationLayerKernel &&) = default;
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data types supported: F16, F32.
+ * @param[in] squared_input Source tensor in which each element has been squared. 3 lower dims represent a single input with dimensions [width, height, IFM].
+ * Data types should match the input type.
+ * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data types should match the input type.
+ * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
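+ *
+ * @note Illustrative usage sketch, not part of the API contract: it assumes `in`, `in_sq` and `out` are
+ * pre-allocated CLTensor objects of matching shape and type, and that `in_sq` already holds the
+ * element-wise square of `in`.
+ * @code
+ * CLNormalizationLayerKernel norm_kernel;
+ * norm_kernel.configure(&in, &in_sq, &out, NormalizationLayerInfo(NormType::CROSS_MAP, 5));
+ * @endcode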
+ */ + void configure(const ICLTensor *input, const ICLTensor *squared_input, ICLTensor *output, NormalizationLayerInfo norm_info); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + BorderSize border_size() const override; + +private: + const ICLTensor *_input; + const ICLTensor *_squared_input; + ICLTensor *_output; + BorderSize _border_size; +}; +} +#endif /*__ARM_COMPUTE_CLNORMALIZATIONLAYERKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h new file mode 100644 index 0000000000..6fbbe95219 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H__ +#define __ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the pixelwise multiplication kernel. + * + */ +class CLPixelWiseMultiplicationKernel : public ICLKernel +{ +public: + /** Default constructor.*/ + CLPixelWiseMultiplicationKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLPixelWiseMultiplicationKernel(const CLPixelWiseMultiplicationKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLPixelWiseMultiplicationKernel &operator=(const CLPixelWiseMultiplicationKernel &) = delete; + /** Allow instances of this class to be moved */ + CLPixelWiseMultiplicationKernel(CLPixelWiseMultiplicationKernel &&) = default; + /** Allow instances of this class to be moved */ + CLPixelWiseMultiplicationKernel &operator=(CLPixelWiseMultiplicationKernel &&) = default; + /** Initialise the kernel's input, output and border mode. + * + * @param[in] input1 An input tensor. Data types supported: U8, S16, F16, F32. + * @param[in] input2 An input tensor. Data types supported: U8, S16, F16, F32. + * @param[out] output The output tensor, Data types supported: U8 (Only if both inputs are U8), S16, F16, F32. + * @param[in] scale Scale to apply after multiplication. + * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. 
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate + * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. + */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale, + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input1; + const ICLTensor *_input2; + ICLTensor *_output; +}; +} + +#endif /*__ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h new file mode 100644 index 0000000000..546a40b15e --- /dev/null +++ b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLPOOLINGLAYERKERNEL_H__ +#define __ARM_COMPUTE_CLPOOLINGLAYERKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the pooling layer kernel */ +class CLPoolingLayerKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLPoolingLayerKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPoolingLayerKernel(const CLPoolingLayerKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPoolingLayerKernel &operator=(const CLPoolingLayerKernel &) = delete; + /** Allow instances of this class to be moved */ + CLPoolingLayerKernel(CLPoolingLayerKernel &&) = default; + /** Allow instances of this class to be moved */ + CLPoolingLayerKernel &operator=(CLPoolingLayerKernel &&) = default; + /** Default destructor */ + ~CLPoolingLayerKernel() = default; + + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data types supported: F16, F32. + * @param[out] output Destination tensor. Data types supported: Same as @p input. + * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. 
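+ *
+ * @note Illustrative usage sketch (assumes `src` and `dst` are pre-allocated CLTensor objects and that
+ * the kernel is enqueued through the runtime's CLScheduler):
+ * @code
+ * CLPoolingLayerKernel pool;
+ * pool.configure(&src, &dst, PoolingLayerInfo(PoolingType::MAX, 2, PadStrideInfo(2, 2, 0, 0)));
+ * CLScheduler::get().enqueue(pool); // 2x2 max pooling, stride 2, no padding
+ * @endcode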
+ */ + void configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + BorderSize border_size() const override; + +private: + const ICLTensor *_input; + ICLTensor *_output; + PoolingLayerInfo _pool_info; + BorderSize _border_size; +}; +} +#endif /*__ARM_COMPUTE_CLPOOLINGLAYERKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLRemapKernel.h b/arm_compute/core/CL/kernels/CLRemapKernel.h new file mode 100644 index 0000000000..7cebf2e817 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLRemapKernel.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLREMAPKERNEL_H__ +#define __ARM_COMPUTE_CLREMAPKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform a remap on a tensor */ +class CLRemapKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLRemapKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLRemapKernel(const CLRemapKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLRemapKernel &operator=(const CLRemapKernel &) = delete; + /** Allow instances of this class to be moved */ + CLRemapKernel(CLRemapKernel &&) = default; + /** Allow instances of this class to be moved */ + CLRemapKernel &operator=(CLRemapKernel &&) = default; + /** Initialize the kernel's input, output and border mode. + * + * @param[in] input Source tensor. Data types supported: U8. + * @param[in] map_x Map for X coordinates. Data types supported: F32. + * @param[in] map_y Map for Y coordinates. Data types supported: F32. + * @param[out] output Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane. + * @param[in] policy The interpolation type. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. 
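+ *
+ * @note Illustrative usage sketch: `src` and `dst` are assumed to be pre-allocated U8 CLTensors, and
+ * `map_x`, `map_y` pre-filled F32 CLTensors of the output shape.
+ * @code
+ * CLRemapKernel remap;
+ * remap.configure(&src, &map_x, &map_y, &dst, InterpolationPolicy::NEAREST_NEIGHBOR, false);
+ * @endcode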
+ */
+ void configure(const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ const ICLTensor *_map_x;
+ const ICLTensor *_map_y;
+};
+}
+#endif /*__ARM_COMPUTE_CLREMAPKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLScaleKernel.h b/arm_compute/core/CL/kernels/CLScaleKernel.h
new file mode 100644
index 0000000000..e74a7cb82a
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLScaleKernel.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSCALEKERNEL_H__
+#define __ARM_COMPUTE_CLSCALEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the scale kernel. */
+class CLScaleKernel : public ICLSimple2DKernel
+{
+public:
+ /** Initialise the kernel's inputs, output and interpolation policy
+ *
+ * @note dx, dy and offsets have the same dimensions (width and height) as the output tensor
+ *
+ * @param[in] input Source tensor. Data types supported: U8, S16.
+ * @param[out] output Destination tensor. Data types supported: U8, S16 (Must be the same as the input tensor).
+ * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+ * @param[in] policy Interpolation type to use
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, bool border_undefined);
+
+ // Inherited methods overridden:
+ BorderSize border_size() const override;
+};
+}
+
+#endif /*__ARM_COMPUTE_CLSCALEKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLScharr3x3Kernel.h b/arm_compute/core/CL/kernels/CLScharr3x3Kernel.h
new file mode 100644
index 0000000000..fe245cc351
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLScharr3x3Kernel.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLSCHARR3X3KERNEL_H__ +#define __ARM_COMPUTE_CLSCHARR3X3KERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the kernel to run a 3x3 Scharr filter on a tensor. + * + * @f[ + * \mathbf{G}_x=\begin{vmatrix} + * -3 & 0 & +3\\ + * -10& 0 & +10\\ + * -3 & 0 & +3 + * \end{vmatrix} + * @f] + * @f[ + * \mathbf{G}_y=\begin{vmatrix} + * -3 & -10 & -3\\ + * 0 & 0 & 0\\ + * +3 & +10 & +3 + * \end{vmatrix} + * @f] + */ +class CLScharr3x3Kernel : public ICLKernel +{ +public: + /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */ + CLScharr3x3Kernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLScharr3x3Kernel(const CLScharr3x3Kernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLScharr3x3Kernel &operator=(const CLScharr3x3Kernel &) = delete; + /** Allow instances of this class to be moved */ + CLScharr3x3Kernel(CLScharr3x3Kernel &&) = default; + /** Allow instances of this class to be moved */ + CLScharr3x3Kernel &operator=(CLScharr3x3Kernel &&) = default; + /** Initialise the kernel's source, destination and border. + * + * @note At least one of output_x or output_y must be set. + * + * @param[in] input Source tensor. Data types supported: U8. + * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S16. + * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S16. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + BorderSize border_size() const override; + +private: + bool _run_scharr_x; /**< Do we need to run Scharr X ? */ + bool _run_scharr_y; /**< Do we need to run Scharr Y ? 
*/ + const ICLTensor *_input; /**< Input image */ + ICLTensor *_output_x; /**< Output image for scharr X */ + ICLTensor *_output_y; /**< Output image for scharr Y */ +}; +} +#endif /*__ARM_COMPUTE_CLSCHARR3X3KERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLSobel3x3Kernel.h b/arm_compute/core/CL/kernels/CLSobel3x3Kernel.h new file mode 100644 index 0000000000..9edeb6ceff --- /dev/null +++ b/arm_compute/core/CL/kernels/CLSobel3x3Kernel.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLSOBEL3X3KERNEL_H__ +#define __ARM_COMPUTE_CLSOBEL3X3KERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the kernel to run a 3x3 Sobel filter on a tensor. */ +class CLSobel3x3Kernel : public ICLKernel +{ +public: + /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */ + CLSobel3x3Kernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLSobel3x3Kernel(const CLSobel3x3Kernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLSobel3x3Kernel &operator=(const CLSobel3x3Kernel &) = delete; + /** Allow instances of this class to be moved */ + CLSobel3x3Kernel(CLSobel3x3Kernel &&) = default; + /** Allow instances of this class to be moved */ + CLSobel3x3Kernel &operator=(CLSobel3x3Kernel &&) = default; + /** Default destructor */ + ~CLSobel3x3Kernel() = default; + /** Initialise the kernel's source, destination and border. + * + * @note At least one of output_x or output_y must be set. + * + * @param[in] input Source tensor. Data types supported: U8. + * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S16. + * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S16. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. 
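+ *
+ * @note Illustrative usage sketch (assumes `src` is a U8 CLTensor and `gx`, `gy` are S16 CLTensors of
+ * the same shape; pass nullptr for a gradient that is not needed):
+ * @code
+ * CLSobel3x3Kernel sobel;
+ * sobel.configure(&src, &gx, &gy, false); // compute both gradients, border mode not undefined
+ * @endcode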
+ */ + void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + BorderSize border_size() const override; + +private: + const ICLTensor *_input; /**< Input tensor */ + ICLTensor *_output_x; /**< Output tensor for Sobel X */ + ICLTensor *_output_y; /**< Output tensor for Sobel Y */ + bool _run_sobel_x; /**< Do we need to run Sobel X ? */ + bool _run_sobel_y; /**< Do we need to run Sobel Y ? */ +}; +} +#endif /*__ARM_COMPUTE_CLSOBEL3X3KERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLSobel5x5Kernel.h b/arm_compute/core/CL/kernels/CLSobel5x5Kernel.h new file mode 100644 index 0000000000..e90f8f587e --- /dev/null +++ b/arm_compute/core/CL/kernels/CLSobel5x5Kernel.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLSOBEL5X5KERNEL_H__ +#define __ARM_COMPUTE_CLSOBEL5X5KERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the kernel to run the horizontal pass of 5x5 Sobel filter on a tensor. */ +class CLSobel5x5HorKernel : public ICLKernel +{ +public: + /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */ + CLSobel5x5HorKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLSobel5x5HorKernel(const CLSobel5x5HorKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLSobel5x5HorKernel &operator=(const CLSobel5x5HorKernel &) = delete; + /** Allow instances of this class to be moved */ + CLSobel5x5HorKernel(CLSobel5x5HorKernel &&) = default; + /** Allow instances of this class to be moved */ + CLSobel5x5HorKernel &operator=(CLSobel5x5HorKernel &&) = default; + /** Default destructor */ + ~CLSobel5x5HorKernel() = default; + + /** Initialise the kernel's source, destination and border. + * + * @note At least one of output_x or output_y must be set. + * + * @param[in] input Source tensor. Data types supported: U8. + * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S16. + * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S16. 
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + BorderSize border_size() const override; + +private: + const ICLTensor *_input; /**< Input tensor */ + ICLTensor *_output_x; /**< X output of horizontal pass */ + ICLTensor *_output_y; /**< Y output of horizontal pass */ + bool _run_sobel_x; /**< Do we need to run Sobel X ? */ + bool _run_sobel_y; /**< Do we need to run Sobel Y ? */ + BorderSize _border_size; /**< Border size */ +}; + +/** Interface for the kernel to run the vertical pass of 5x5 Sobel filter on a tensor. */ +class CLSobel5x5VertKernel : public ICLKernel +{ +public: + /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */ + CLSobel5x5VertKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLSobel5x5VertKernel(const CLSobel5x5VertKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLSobel5x5VertKernel &operator=(const CLSobel5x5VertKernel &) = delete; + /** Allow instances of this class to be moved */ + CLSobel5x5VertKernel(CLSobel5x5VertKernel &&) = default; + /** Allow instances of this class to be moved */ + CLSobel5x5VertKernel &operator=(CLSobel5x5VertKernel &&) = default; + /** Default destructor */ + ~CLSobel5x5VertKernel() = default; + + /** Initialise the kernel's source, destination and border. + * + * @note At least one of output_x or output_y must be set and the corresponding input. + * + * @param[in] input_x (Optional) Input for X (X output of horizontal pass). Data types supported: S16. + * @param[in] input_y (Optional) Input for Y (Y output of horizontal pass). Data types supported: S16. + * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S16. + * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S16. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + BorderSize border_size() const override; + +private: + const ICLTensor *_input_x; /**< X input (X output of the horizontal pass) */ + const ICLTensor *_input_y; /**< Y input (Y output of the horizontal pass) */ + ICLTensor *_output_x; /**< X output of sobel */ + ICLTensor *_output_y; /**< Y output of sobel */ + bool _run_sobel_x; /**< Do we need to run sobel X? */ + bool _run_sobel_y; /**< Do we need to run sobel Y? */ +}; +} +#endif /*__ARM_COMPUTE_CLSOBEL5X5KERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLSobel7x7Kernel.h b/arm_compute/core/CL/kernels/CLSobel7x7Kernel.h new file mode 100644 index 0000000000..e5ef8444ee --- /dev/null +++ b/arm_compute/core/CL/kernels/CLSobel7x7Kernel.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLSOBEL7X7KERNEL_H__ +#define __ARM_COMPUTE_CLSOBEL7X7KERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the kernel to run the horizontal pass of 7x7 Sobel filter on a tensor. */ +class CLSobel7x7HorKernel : public ICLKernel +{ +public: + /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */ + CLSobel7x7HorKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLSobel7x7HorKernel(const CLSobel7x7HorKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLSobel7x7HorKernel &operator=(const CLSobel7x7HorKernel &) = delete; + /** Allow instances of this class to be moved */ + CLSobel7x7HorKernel(CLSobel7x7HorKernel &&) = default; + /** Allow instances of this class to be moved */ + CLSobel7x7HorKernel &operator=(CLSobel7x7HorKernel &&) = default; + /** Default destructor */ + ~CLSobel7x7HorKernel() = default; + + /** Initialise the kernel's source, destination and border. + * + * @note At least one of output_x or output_y must be set. + * + * @param[in] input Source tensor. Data types supported: U8. + * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S32. + * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S32. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + BorderSize border_size() const override; + +private: + const ICLTensor *_input; /**< Input tensor */ + ICLTensor *_output_x; /**< X output of horizontal pass */ + ICLTensor *_output_y; /**< Y output of horizontal pass */ + bool _run_sobel_x; /**< Do we need to run Sobel X ? */ + bool _run_sobel_y; /**< Do we need to run Sobel Y ? */ + BorderSize _border_size; /**< Border size */ +}; + +/** Interface for the kernel to run the vertical pass of 7x7 Sobel filter on a tensor. */ +class CLSobel7x7VertKernel : public ICLKernel +{ +public: + /** Default constructor: initialize all the pointers to nullptr and parameters to zero. 
*/ + CLSobel7x7VertKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLSobel7x7VertKernel(const CLSobel7x7VertKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLSobel7x7VertKernel &operator=(const CLSobel7x7VertKernel &) = delete; + /** Allow instances of this class to be moved */ + CLSobel7x7VertKernel(CLSobel7x7VertKernel &&) = default; + /** Allow instances of this class to be moved */ + CLSobel7x7VertKernel &operator=(CLSobel7x7VertKernel &&) = default; + /** Default destructor */ + ~CLSobel7x7VertKernel() = default; + + /** Initialise the kernel's source, destination and border. + * + * @note At least one of output_x or output_y must be set and the corresponding input. + * + * @param[in] input_x (Optional) Input for X (X output of horizontal pass). Data types supported: S32. + * @param[in] input_y (Optional) Input for Y (Y output of horizontal pass). Data types supported: S32. + * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S32. + * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S32. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + BorderSize border_size() const override; + +private: + const ICLTensor *_input_x; /**< X input (X output of the horizontal pass) */ + const ICLTensor *_input_y; /**< Y input (Y output of the horizontal pass) */ + ICLTensor *_output_x; /**< X output of sobel */ + ICLTensor *_output_y; /**< Y output of sobel */ + bool _run_sobel_x; /**< Do we need to run sobel X? */ + bool _run_sobel_y; /**< Do we need to run sobel Y? */ +}; +} +#endif /*__ARM_COMPUTE_CLSOBEL7X7KERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h b/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h new file mode 100644 index 0000000000..0806974ad6 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef __ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H__
+#define __ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for identifying the max value of 1D Logits */
+class CLLogits1DMaxKernel : public ICLSimple2DKernel
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: F16, F32. Number of channels must be 1.
+ * @param[out] output Destination tensor. Matching input type and channel number.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+};
+
+/** Interface for shifting the logits values around the max value and exponentiating the result */
+class CLLogits1DShiftExpSumKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLLogits1DShiftExpSumKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLLogits1DShiftExpSumKernel(const CLLogits1DShiftExpSumKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLLogits1DShiftExpSumKernel &operator=(const CLLogits1DShiftExpSumKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLLogits1DShiftExpSumKernel(CLLogits1DShiftExpSumKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLLogits1DShiftExpSumKernel &operator=(CLLogits1DShiftExpSumKernel &&) = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: F16, F32. Number of channels must be 1.
+ * @param[in] max Max values tensor. Matching input type and channel number.
+ * @param[out] output Destination tensor. Matching input type and channel number.
+ * @param[out] sum Sum of 1D logits tensor. Matching input type and channel number.
+ */
+ void configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ const ICLTensor *_max;
+ ICLTensor *_output;
+ ICLTensor *_sum;
+};
+
+/** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */
+class CLLogits1DNormKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLLogits1DNormKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLLogits1DNormKernel(const CLLogits1DNormKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLLogits1DNormKernel &operator=(const CLLogits1DNormKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLLogits1DNormKernel(CLLogits1DNormKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLLogits1DNormKernel &operator=(CLLogits1DNormKernel &&) = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: F16, F32. Number of channels must be 1.
+ * @param[in] sum Sum tensor. Dimensions should be dim(input)-1. Matching input type and channel number.
+ * @param[out] output Destination tensor. Matching input type and channel number.
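+ *
+ * @note Illustrative sketch of the full three-stage softmax chain (all CLTensors are assumed to be
+ * pre-allocated with matching types; `max_vals`, `tmp` and `sum` are intermediate tensors):
+ * @code
+ * CLLogits1DMaxKernel max_kernel;
+ * CLLogits1DShiftExpSumKernel shift_exp_sum_kernel;
+ * CLLogits1DNormKernel norm_kernel;
+ * max_kernel.configure(&in, &max_vals);
+ * shift_exp_sum_kernel.configure(&in, &max_vals, &tmp, &sum);
+ * norm_kernel.configure(&tmp, &sum, &out);
+ * @endcode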
+ */ + void configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + const ICLTensor *_sum; + ICLTensor *_output; +}; +} +#endif /*__ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLTableLookupKernel.h b/arm_compute/core/CL/kernels/CLTableLookupKernel.h new file mode 100644 index 0000000000..477f58dc38 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLTableLookupKernel.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLTABLELOOKUPKERNEL_H__ +#define __ARM_COMPUTE_CLTABLELOOKUPKERNEL_H__ + +#include "arm_compute/core/CL/ICLSimple2DKernel.h" + +namespace arm_compute +{ +class ICLTensor; +class ICLLut; + +/** Interface for the kernel to perform table lookup calculations. */ +class CLTableLookupKernel : public ICLSimple2DKernel +{ +public: + /** Initialise the kernel's input, lut and output. + * + * @param[in] input An input tensor. Data types supported: U8, S16. + * @param[in] lut The input LUT. Data types supported: U8, S16. + * @param[out] output The output tensor. Data types supported: U8, S16. + */ + void configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output); +}; +} +#endif /* __ARM_COMPUTE_CLTABLELOOKUPKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLThresholdKernel.h b/arm_compute/core/CL/kernels/CLThresholdKernel.h new file mode 100644 index 0000000000..d7a6ae2cdb --- /dev/null +++ b/arm_compute/core/CL/kernels/CLThresholdKernel.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLTHRESHOLDKERNEL_H__
+#define __ARM_COMPUTE_CLTHRESHOLDKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the thresholding kernel.
+ *
+ */
+class CLThresholdKernel : public ICLSimple2DKernel
+{
+public:
+ /** Initialise the kernel's input, output and threshold parameters.
+ *
+ * @param[in] input An input tensor. Data types supported: U8
+ * @param[out] output The output tensor. Data types supported: U8.
+ * @param[in] threshold Threshold. When the threshold type is RANGE, this is used as the lower threshold.
+ * @param[in] false_value Value to set when the condition is not respected.
+ * @param[in] true_value Value to set when the condition is respected.
+ * @param[in] type Thresholding type. Either RANGE or BINARY.
+ * @param[in] upper Upper threshold. Only used when the thresholding type is RANGE.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold,
+ uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper);
+};
+}
+#endif /*__ARM_COMPUTE_CLTHRESHOLDKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLTransposeKernel.h b/arm_compute/core/CL/kernels/CLTransposeKernel.h
new file mode 100644
index 0000000000..9ad183f8f1
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLTransposeKernel.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLTRANSPOSEKERNEL_H__
+#define __ARM_COMPUTE_CLTRANSPOSEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel which transposes the elements of a matrix.
+ *
+ * [width, height, batch] -> [height, width, batch]
+ *
+ */
+class CLTransposeKernel : public ICLSimple2DKernel
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+};
+}
+#endif /* __ARM_COMPUTE_CLTRANSPOSEKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLWarpAffineKernel.h b/arm_compute/core/CL/kernels/CLWarpAffineKernel.h
new file mode 100644
index 0000000000..05d6d0a8f7
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLWarpAffineKernel.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLWARPAFFINEKERNEL_H__
+#define __ARM_COMPUTE_CLWARPAFFINEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the warp affine kernel.*/
+class CLWarpAffineKernel : public ICLSimple2DKernel
+{
+public:
+ /** Initialize the function's source, destination, interpolation policy and border_mode.
+ *
+ * @param[in] input Source tensor. Data types supported: U8.
+ * @param[out] output Destination tensor, Data types supported: U8.
+ * @param[in] matrix The affine matrix. Must be 2x3 of type float.
+ * @param[in] policy The interpolation type.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy);
+
+ // Inherited methods overridden:
+ BorderSize border_size() const override;
+};
+}
+#endif /*__ARM_COMPUTE_CLWARPAFFINEKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h b/arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h
new file mode 100644
index 0000000000..5c5013c599
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLWARPERSPECTIVEKERNEL_H__ +#define __ARM_COMPUTE_CLWARPERSPECTIVEKERNEL_H__ + +#include "arm_compute/core/CL/ICLSimple2DKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ICLTensor; +/** Interface for the warp perspective kernel.*/ +class CLWarpPerspectiveKernel : public ICLSimple2DKernel +{ +public: + /** Initialize the function's source, destination, interpolation policy and border_mode. + * + * @param[in] input Source tensor. Data types supported: U8. + * @param[out] output Destination tensor, Data types supported: U8. + * @param[in] matrix The perspective matrix. Must be 3x3 of type float. + * @param[in] policy The interpolation type. + */ + void configure(const ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy); + + // Inherited methods overridden: + BorderSize border_size() const override; +}; +} + +#endif /*__ARM_COMPUTE_CLWARPERSPECTIVEKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h new file mode 100644 index 0000000000..1dc8a8b80e --- /dev/null +++ b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef __ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H__
+#define __ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class CLWeightsReshapeKernel : public ICLKernel
+{
+public:
+ /** Constructor.
+ *
+ * @param[in] is_shared Flag to indicate whether the weights are shared or not.
+ */
+ CLWeightsReshapeKernel(bool is_shared = false);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLWeightsReshapeKernel(const CLWeightsReshapeKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLWeightsReshapeKernel &operator=(const CLWeightsReshapeKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLWeightsReshapeKernel(CLWeightsReshapeKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLWeightsReshapeKernel &operator=(CLWeightsReshapeKernel &&) = default;
+ /** Default destructor */
+ ~CLWeightsReshapeKernel() = default;
+
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
+ * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: F16, F32
+ * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
+ * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
+ * @param[out] output The output tensor. Should be a 2D tensor. Data types supported: Same as @p input
+ */
+ void configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output);
+
+ // Inherited methods overridden:
+ virtual void run(const Window &window, cl::CommandQueue &queue) = 0;
+
+protected:
+ bool _is_shared;
+ const ICLTensor *_input;
+ const ICLTensor *_biases;
+ ICLTensor *_output;
+};
+
+/** Interface for the weights reshape kernel used by convolution and fully connected layers.
+ *
+ * Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels.
+ * In combination with the @ref CLIm2ColKernel, it can transform a convolution into a matrix multiplication.
+ *
+ * For example assuming a 3D weight kernel of 3x3 dimensions and depth of 2 we have:
+ * @f[
+ * \left( \begin{array}{ccc}
+ * a000 & a001 & a002 \\
+ * a010 & a011 & a012 \\
+ * a020 & a021 & a022 \\
+ * \end{array} \right)
+ * \left( \begin{array}{ccc}
+ * a100 & a101 & a102 \\
+ * a110 & a111 & a112 \\
+ * a120 & a121 & a122 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{cccccccccccccccccc}
+ * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\
+ * \end{array} \right)
+ * @f]
+ */
+class CLConvolutionLayerWeightsReshapeKernel : public CLWeightsReshapeKernel
+{
+public:
+ /** Default constructor */
+ CLConvolutionLayerWeightsReshapeKernel();
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+};
+
+/** Interface for the weights reshape kernel used by locally connected layers.
*/ +class CLLocallyConnectedLayerWeightsReshapeKernel : public CLWeightsReshapeKernel +{ +public: + /** Default constructor */ + CLLocallyConnectedLayerWeightsReshapeKernel(); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; +}; +} +#endif /*__ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H__ */ diff --git a/arm_compute/core/CPP/CPPKernels.h b/arm_compute/core/CPP/CPPKernels.h new file mode 100644 index 0000000000..1eabfa9437 --- /dev/null +++ b/arm_compute/core/CPP/CPPKernels.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CPPKERNELS_H__ +#define __ARM_COMPUTE_CPPKERNELS_H__ + +/* Header regrouping all the CPP kernels */ +#include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h" +#include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h" +#include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h" + +#endif /* __ARM_COMPUTE_CPPKERNELS_H__ */ diff --git a/arm_compute/core/CPP/ICPPKernel.h b/arm_compute/core/CPP/ICPPKernel.h new file mode 100644 index 0000000000..99ae68f2e5 --- /dev/null +++ b/arm_compute/core/CPP/ICPPKernel.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_ICPPKERNEL_H__ +#define __ARM_COMPUTE_ICPPKERNEL_H__ + +#include "arm_compute/core/IKernel.h" + +namespace arm_compute +{ +class Window; + +/** Common interface for all kernels implemented in C++ */ +class ICPPKernel : public IKernel +{ +public: + /** Default destructor */ + virtual ~ICPPKernel() = default; + + /** Execute the kernel on the passed window + * + * @warning If is_parallelisable() returns false then the passed window must be equal to window() + * + * @note The window has to be a region within the window returned by the window() method + * + * @note The width of the window has to be a multiple of num_elems_processed_per_iteration(). + * + * @param[in] window Region on which to execute the kernel. (Must be a region of the window returned by window()) + */ + virtual void run(const Window &window) = 0; +}; +} +#endif /*__ARM_COMPUTE_ICPPKERNEL_H__ */
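A minimal sketch of what implementing this interface looks like. The kernel below is hypothetical (not part of the patch), and it assumes IKernel exposes the usual protected configure(const Window &) plus the Iterator/execute_window_loop helpers declared later in this patch; run() only touches the region described by the window it receives, which is what lets a scheduler split the iteration space across threads.

#include "arm_compute/core/CPP/ICPPKernel.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Window.h"

#include <cstdint>

namespace arm_compute
{
// Trivial example kernel: fill a tensor with a constant byte value
class CPPFillKernel : public ICPPKernel
{
public:
    void configure(ITensor *output, uint8_t value)
    {
        _output = output;
        _value  = value;
        // Process the whole tensor, one element per iteration
        ICPPKernel::configure(calculate_max_window(*output->info()));
    }
    void run(const Window &window) override
    {
        Iterator out(_output, window);
        execute_window_loop(window, [&](const Coordinates &)
        {
            *out.ptr() = _value; // write the current element only
        },
        out);
    }

private:
    ITensor *_output{ nullptr };
    uint8_t  _value{ 0 };
};
} // namespace arm_compute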
diff --git a/arm_compute/core/CPP/ICPPSimpleKernel.h b/arm_compute/core/CPP/ICPPSimpleKernel.h new file mode 100644 index 0000000000..105de397a2 --- /dev/null +++ b/arm_compute/core/CPP/ICPPSimpleKernel.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_ICPPSIMPLEKERNEL_H__ +#define __ARM_COMPUTE_ICPPSIMPLEKERNEL_H__ + +#include "arm_compute/core/CPP/ICPPKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for simple C++ kernels having 1 tensor input and 1 tensor output */ +class ICPPSimpleKernel : public ICPPKernel +{ +public: + /** Constructor */ + ICPPSimpleKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + ICPPSimpleKernel(const ICPPSimpleKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + ICPPSimpleKernel &operator=(const ICPPSimpleKernel &) = delete; + /** Allow instances of this class to be moved */ + ICPPSimpleKernel(ICPPSimpleKernel &&) = default; + /** Allow instances of this class to be moved */ + ICPPSimpleKernel &operator=(ICPPSimpleKernel &&) = default; + /** Default destructor */ + ~ICPPSimpleKernel() = default; + +protected: + /** Configure the kernel + * + * @param[in] input Source tensor. + * @param[out] output Destination tensor. + * @param[in] num_elems_processed_per_iteration Number of processed elements per iteration. + * @param[in] border_undefined (Optional) True if the border mode is undefined. False if it's replicate or constant. + * @param[in] border_size (Optional) Size of the border. + */ + void configure(const ITensor *input, ITensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined = false, const BorderSize &border_size = BorderSize()); + +protected: + const ITensor *_input; + ITensor *_output; +}; +} +#endif /*__ARM_COMPUTE_ICPPSIMPLEKERNEL_H__ */ diff --git a/arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h b/arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h new file mode 100644 index 0000000000..0866d4ee57 --- /dev/null +++ b/arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CPPCORNERCANDIDATESKERNEL_H__ +#define __ARM_COMPUTE_CPPCORNERCANDIDATESKERNEL_H__ + +#include "arm_compute/core/IArray.h" +#include "arm_compute/core/NEON/INEKernel.h" + +#include <cstdint> +#include <mutex> + +namespace arm_compute +{ +class ITensor; +using IImage = ITensor; + +/** CPP kernel to compute the corner candidates + */ +class CPPCornerCandidatesKernel : public INEKernel +{ +public: + /** Default constructor */ + CPPCornerCandidatesKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CPPCornerCandidatesKernel(const CPPCornerCandidatesKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CPPCornerCandidatesKernel &operator=(const CPPCornerCandidatesKernel &) = delete; + /** Allow instances of this class to be moved */ + CPPCornerCandidatesKernel(CPPCornerCandidatesKernel &&) = default; + /** Allow instances of this class to be moved */ + CPPCornerCandidatesKernel &operator=(CPPCornerCandidatesKernel &&) = default; + /** Default destructor */ + ~CPPCornerCandidatesKernel() = default; + + /** Setup the kernel parameters + * + * @param[in] input Source image (Harris score).
Format supported: F32 + * @param[out] output Destination array of InternalKeypoint + * @param[out] num_corner_candidates Number of corner candidates + */ + void configure(const IImage *input, InternalKeypoint *output, int32_t *num_corner_candidates); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + int32_t *_num_corner_candidates; /**< Number of corner candidates */ + std::mutex _corner_candidates_mutex; /**< Mutex to prevent race conditions */ + const IImage *_input; /**< Source image - Harris score */ + InternalKeypoint *_output; /**< Array of InternalKeypoint */ +}; +} //namespace arm_compute +#endif /* __ARM_COMPUTE_CPPCORNERCANDIDATESKERNEL_H__ */
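The kernel above collects thresholded Harris responses from several worker threads into one shared output array, which is why it carries a mutex. An illustrative sketch of that collection pattern (the types here are stand-ins; the real kernel writes InternalKeypoint entries into a preallocated array):

#include <cstdint>
#include <mutex>
#include <vector>

struct Candidate
{
    int32_t x;
    int32_t y;
    float   strength;
};

// Threads scanning disjoint window regions call this for every pixel whose
// Harris score exceeds the threshold; appends are serialized with the mutex.
void append_candidate(std::vector<Candidate> &out, std::mutex &m, Candidate c, float threshold)
{
    if(c.strength > threshold)
    {
        std::lock_guard<std::mutex> lock(m); // protect the shared array
        out.push_back(c);
    }
}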
diff --git a/arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h b/arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h new file mode 100644 index 0000000000..bcb3026959 --- /dev/null +++ b/arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CPPDETECTIONWINDOWNONMAXIMASUPPRESSIONKERNEL_H__ +#define __ARM_COMPUTE_CPPDETECTIONWINDOWNONMAXIMASUPPRESSIONKERNEL_H__ + +#include "arm_compute/core/IArray.h" +#include "arm_compute/core/IHOG.h" +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +/** CPP kernel to perform in-place computation of Euclidean distance on IDetectionWindowArray + * + * @note This kernel is meant to be used alongside HOG or other object detection algorithms to perform a non-maxima suppression on a + * IDetectionWindowArray + */ +class CPPDetectionWindowNonMaximaSuppressionKernel : public ICPPKernel +{ +public: + /** Default constructor */ + CPPDetectionWindowNonMaximaSuppressionKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CPPDetectionWindowNonMaximaSuppressionKernel(const CPPDetectionWindowNonMaximaSuppressionKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CPPDetectionWindowNonMaximaSuppressionKernel &operator=(const CPPDetectionWindowNonMaximaSuppressionKernel &) = delete; + /** Allow instances of this class to be moved */ + CPPDetectionWindowNonMaximaSuppressionKernel(CPPDetectionWindowNonMaximaSuppressionKernel &&) = default; + /** Allow instances of this class to be moved */ + CPPDetectionWindowNonMaximaSuppressionKernel &operator=(CPPDetectionWindowNonMaximaSuppressionKernel &&) = default; + /** Initialise the kernel's input, output and the Euclidean minimum distance + * + * @attention If @ref CLDetectionWindowArray is passed to the kernel, the map() and unmap() methods of @ref CLDetectionWindowArray must be called respectively before and after + * the run() method of @ref CPPDetectionWindowNonMaximaSuppressionKernel + * + * @param[in, out] input_output Input/Output array of @ref DetectionWindow + * @param[in] min_distance Radial Euclidean distance for non-maxima suppression + */ + void configure(IDetectionWindowArray *input_output, float min_distance); + + // Inherited methods overridden: + void run(const Window &window) override; + bool is_parallelisable() const override; + +private: + IDetectionWindowArray *_input_output; + float _min_distance; +}; +} + +#endif /* __ARM_COMPUTE_CPPDETECTIONWINDOWNONMAXIMASUPPRESSIONKERNEL_H__ */
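A hedged usage sketch of this kernel. The use of arm_compute/runtime/Array.h and DetectionWindowArray is an assumption drawn from the library's runtime layer, and the 2.0f radius is illustrative; with a CL-backed array the buffer would additionally need map()/unmap() around run(), per the @attention note above.

#include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
#include "arm_compute/runtime/Array.h"

using namespace arm_compute;

// Suppress detection windows that lie within 2 pixels of a stronger detection
void suppress(DetectionWindowArray &windows)
{
    CPPDetectionWindowNonMaximaSuppressionKernel nms;
    nms.configure(&windows, 2.0f /* min_distance */);
    // A CPP kernel is normally dispatched through a scheduler; calling run()
    // directly with the kernel's own window is the simplest single-threaded use.
    nms.run(nms.window());
}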
diff --git a/arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h b/arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h new file mode 100644 index 0000000000..b7a7d9ff9f --- /dev/null +++ b/arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CPPSORTEUCLIDEANDISTANCEKERNEL_H__ +#define __ARM_COMPUTE_CPPSORTEUCLIDEANDISTANCEKERNEL_H__ + +#include "arm_compute/core/CPP/ICPPKernel.h" +#include "arm_compute/core/IArray.h" + +#include <cstdint> +#include <mutex> + +namespace arm_compute +{ +/** CPP kernel to perform sorting and Euclidean distance */ +class CPPSortEuclideanDistanceKernel : public ICPPKernel +{ +public: + /** Default constructor */ + CPPSortEuclideanDistanceKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CPPSortEuclideanDistanceKernel(const CPPSortEuclideanDistanceKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CPPSortEuclideanDistanceKernel &operator=(const CPPSortEuclideanDistanceKernel &) = delete; + /** Allow instances of this class to be moved */ + CPPSortEuclideanDistanceKernel(CPPSortEuclideanDistanceKernel &&) = default; + /** Allow instances of this class to be moved */ + CPPSortEuclideanDistanceKernel &operator=(CPPSortEuclideanDistanceKernel &&) = default; + /** Initialise the kernel's input, output and minimum Euclidean distance + * + * @param[in,out] in_out Input internal keypoints. Marked as out as the kernel writes 0 in the strength member. + * @param[out] output Output keypoints. + * @param[in] num_corner_candidates Pointer to the number of corner candidates in the input array + * @param[in] min_distance Radial Euclidean distance to use + */ + void configure(InternalKeypoint *in_out, IKeyPointArray *output, const int32_t *num_corner_candidates, float min_distance); + + // Inherited methods overridden: + void run(const Window &window) override; + bool is_parallelisable() const override; + +private: + const int32_t *_num_corner_candidates; /**< Number of corner candidates */ + float _min_distance; /**< Radial Euclidean distance */ + InternalKeypoint *_in_out; /**< Source array of InternalKeypoint */ + IKeyPointArray *_output; /**< Destination keypoint array */ +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CPPSORTEUCLIDEANDISTANCEKERNEL_H__ */ diff --git a/arm_compute/core/Coordinates.h b/arm_compute/core/Coordinates.h new file mode 100644 index 0000000000..3a99abbd74 --- /dev/null +++ b/arm_compute/core/Coordinates.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_COORDINATES_H__ +#define __ARM_COMPUTE_COORDINATES_H__ + +#include "arm_compute/core/Dimensions.h" +#include "arm_compute/core/Error.h" + +#include <algorithm> +#include <array> +#include <cstddef> + +namespace arm_compute +{ +/** Coordinates of an item */ +class Coordinates : public Dimensions<int> +{ +public: + /** Constructor to initialize the coordinates. + * + * @param[in] coords Values to initialize the dimensions. + */ + template <typename... Ts> + constexpr Coordinates(Ts... coords) + : Dimensions{ coords... } + { + } + /** Allow instances of this class to be copy constructed */ + constexpr Coordinates(const Coordinates &) = default; + /** Allow instances of this class to be copied */ + Coordinates &operator=(const Coordinates &) = default; + /** Allow instances of this class to be move constructed */ + constexpr Coordinates(Coordinates &&) = default; + /** Allow instances of this class to be moved */ + Coordinates &operator=(Coordinates &&) = default; + /** Default destructor */ + ~Coordinates() = default; +}; +} +#endif /*__ARM_COMPUTE_COORDINATES_H__*/ diff --git a/arm_compute/core/Dimensions.h b/arm_compute/core/Dimensions.h new file mode 100644 index 0000000000..b080435b69 --- /dev/null +++ b/arm_compute/core/Dimensions.h @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_DIMENSIONS_H__ +#define __ARM_COMPUTE_DIMENSIONS_H__ + +#include "arm_compute/core/Error.h" + +#include <algorithm> +#include <array> +#include <functional> +#include <numeric> + +namespace arm_compute +{ +/* Constant value used to indicate maximum dimensions of a Window, TensorShape and Coordinates */ +constexpr size_t MAX_DIMS = 6; + +/** Dimensions with dimensionality */ +template <typename T> +class Dimensions +{ +public: + /** Maximum number of dimensions the tensor can have */ + static constexpr size_t num_max_dimensions = MAX_DIMS; + + /** Constructor to initialize the tensor shape. + * + * @param[in] dims Values to initialize the dimensions. + */ + template <typename... Ts> + Dimensions(Ts... dims) + : _id{ { dims...
} }, _num_dimensions{ sizeof...(dims) } + { + } + + /** Allow instances of this class to be copy constructed */ + Dimensions(const Dimensions &) = default; + + /** Allow instances of this class to be copied */ + Dimensions &operator=(const Dimensions &) = default; + + /** Allow instances of this class to be move constructed */ + Dimensions(Dimensions &&) = default; + + /** Allow instances of this class to be moved */ + Dimensions &operator=(Dimensions &&) = default; + + /** Accessor to set the value of one of the dimensions. + * + * @param[in] dimension Dimension for which the value is set. + * @param[in] value Value to be set for the dimension. + */ + void set(size_t dimension, T value) + { + ARM_COMPUTE_ERROR_ON(dimension >= num_max_dimensions); + _id[dimension] = value; + _num_dimensions = std::max(_num_dimensions, dimension + 1); + } + /** Alias to access the size of the first dimension */ + T x() const + { + return _id[0]; + } + /** Alias to access the size of the second dimension */ + T y() const + { + return _id[1]; + } + /** Alias to access the size of the third dimension */ + T z() const + { + return _id[2]; + } + /** Generic accessor to get the size of any dimension + * + * @note Precondition: dimension < Dimensions::num_max_dimensions + * + * @param[in] dimension Dimension of the wanted size + * + * @return The size of the requested dimension. + */ + T operator[](size_t dimension) const + { + ARM_COMPUTE_ERROR_ON(dimension >= num_max_dimensions); + return _id[dimension]; + } + /** Returns the effective dimensionality of the tensor */ + unsigned int num_dimensions() const + { + return _num_dimensions; + } + + /** Set number of dimensions */ + void set_num_dimensions(size_t num_dimensions) + { + _num_dimensions = num_dimensions; + } + + /** Collapse dimensions. + * + * @param[in] n Number of dimensions to collapse into @p first. + * @param[in] first Dimensions into which the following @p n are collapsed. + */ + void collapse(size_t n, size_t first = 0) + { + ARM_COMPUTE_ERROR_ON(first + n > _id.size()); + + // Collapse dimensions into the first + _id[first] = std::accumulate(_id.cbegin() + first, _id.cbegin() + first + n, 1, std::multiplies<T>()); + // Shift the remaining dimensions down + std::copy(_id.begin() + first + n, _id.end(), _id.begin() + first + 1); + // Reduce the number of dimensions + _num_dimensions -= n - 1; + // Fill the now empty dimensions with zero + std::fill(_id.begin() + _num_dimensions, _id.end(), 0); + } + + /** Returns a read/write iterator that points to the first element in the dimension array. */ + typename std::array<T, num_max_dimensions>::iterator begin() + { + return _id.begin(); + } + /** Returns a read-only (constant) iterator that points to the first element in the dimension array. */ + typename std::array<T, num_max_dimensions>::const_iterator begin() const + { + return _id.begin(); + } + /** Returns a read-only (constant) iterator that points to the first element in the dimension array. */ + typename std::array<T, num_max_dimensions>::const_iterator cbegin() const + { + return begin(); + } + /** Returns a read/write iterator that points one past the last element in the dimension array. */ + typename std::array<T, num_max_dimensions>::iterator end() + { + return _id.end(); + } + /** Returns a read-only (constant) iterator that points one past the last element in the dimension array. */ + typename std::array<T, num_max_dimensions>::const_iterator end() const + { + return _id.end(); + } + /** Returns a read-only (constant) iterator that points one past the last element in the dimension array.
*/ + typename std::array<T, num_max_dimensions>::const_iterator cend() const + { + return end(); + } + +protected: + /** Protected destructor. */ + ~Dimensions() = default; + + std::array<T, num_max_dimensions> _id; + size_t _num_dimensions{ 0 }; +}; +} +#endif /*__ARM_COMPUTE_DIMENSIONS_H__*/
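A short usage example built only from classes in this patch. Dimensions itself cannot be instantiated directly (its destructor is protected, it is a base for Coordinates, TensorShape, Steps and Strides), so Coordinates is used here; collapse(2) multiplies the first two dimensions together and shifts the rest down.

#include "arm_compute/core/Coordinates.h"
#include <cassert>

using namespace arm_compute;

int main()
{
    Coordinates c(2, 3, 4); // x = 2, y = 3, z = 4
    assert(c.num_dimensions() == 3);

    // Collapse the first two dimensions into one: (2, 3, 4) -> (6, 4)
    c.collapse(2);
    assert(c.x() == 6 && c.y() == 4);
    assert(c.num_dimensions() == 2);
    return 0;
}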
diff --git a/arm_compute/core/Error.h b/arm_compute/core/Error.h new file mode 100644 index 0000000000..c4c452bacf --- /dev/null +++ b/arm_compute/core/Error.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_ERROR_H__ +#define __ARM_COMPUTE_ERROR_H__ + +/** Print the given message then throw an std::runtime_error. + * + * @param[in] ... Message to display before aborting. + */ +#define ARM_COMPUTE_ERROR(...) ::arm_compute::error(__func__, __FILE__, __LINE__, __VA_ARGS__) // NOLINT + +/** Print the given message then throw an std::runtime_error. + * + * @param[in] func Function in which the error occurred. + * @param[in] file File in which the error occurred. + * @param[in] line Line in which the error occurred. + * @param[in] ... Message to display before aborting. + */ +#define ARM_COMPUTE_ERROR_LOC(func, file, line, ...) ::arm_compute::error(func, file, line, __VA_ARGS__) // NOLINT + +/** To avoid unused variables warnings + * + * This is useful if for example a variable is only used + * in debug builds and generates a warning in release builds. + * + * @param[in] var Variable which is unused + */ +#define ARM_COMPUTE_UNUSED(var) (void)(var) + +#ifdef ARM_COMPUTE_DEBUG_ENABLED +/** Print the given message + * + * @param[in] ... Message to display + */ +#define ARM_COMPUTE_INFO(...) ::arm_compute::debug(__func__, __FILE__, __LINE__, __VA_ARGS__) // NOLINT +/** If the condition is true, the given message is printed + * + * @param[in] cond Condition to evaluate. + * @param[in] ... Message to print if cond is true. + */ +#define ARM_COMPUTE_INFO_ON_MSG(cond, ...) \ + do \ + { \ + if(cond) \ + { \ + ARM_COMPUTE_INFO(__VA_ARGS__); \ + } \ + } while(0) +#else /* ARM_COMPUTE_DEBUG_ENABLED */ +#define ARM_COMPUTE_INFO_ON_MSG(cond, ...) +#define ARM_COMPUTE_INFO(...) +#endif /* ARM_COMPUTE_DEBUG_ENABLED */ + +#ifdef ARM_COMPUTE_ASSERTS_ENABLED +/** If the condition is true, the given message is printed and an exception is thrown + * + * @param[in] cond Condition to evaluate. + * @param[in] ... Message to print if cond is true. + */ +#define ARM_COMPUTE_ERROR_ON_MSG(cond, ...) \ + do \ + { \ + if(cond) \ + { \ + ARM_COMPUTE_ERROR(__VA_ARGS__); \ + } \ + } while(0) + +/** If the condition is true, the given message is printed and an exception is thrown + * + * @param[in] cond Condition to evaluate. + * @param[in] func Function in which the error occurred. + * @param[in] file File in which the error occurred. + * @param[in] line Line in which the error occurred. + * @param[in] ... Message to print if cond is true. + */ +#define ARM_COMPUTE_ERROR_ON_LOC_MSG(cond, func, file, line, ...) \ + do \ + { \ + if(cond) \ + { \ + ARM_COMPUTE_ERROR_LOC(func, file, line, __VA_ARGS__); \ + } \ + } while(0) + +/** If the condition is true, the given message is printed and an exception is thrown, otherwise value is returned + * + * @param[in] cond Condition to evaluate. + * @param[in] val Value to be returned. + * @param[in] msg Message to print if cond is true. + */ +#define ARM_COMPUTE_CONST_ON_ERROR(cond, val, msg) (cond) ? throw std::logic_error(msg) : val; +#else /* ARM_COMPUTE_ASSERTS_ENABLED */ +#define ARM_COMPUTE_ERROR_ON_MSG(cond, ...) +#define ARM_COMPUTE_ERROR_ON_LOC_MSG(cond, func, file, line, ...) +#define ARM_COMPUTE_CONST_ON_ERROR(cond, val, msg) val +#endif /* ARM_COMPUTE_ASSERTS_ENABLED */ + +/** If the condition is true then an error message is printed and an exception thrown + * + * @param[in] cond Condition to evaluate + */ +#define ARM_COMPUTE_ERROR_ON(cond) \ + ARM_COMPUTE_ERROR_ON_MSG(cond, #cond) + +/** If the condition is true then an error message is printed and an exception thrown + * + * @param[in] cond Condition to evaluate + * @param[in] func Function in which the error occurred. + * @param[in] file File in which the error occurred. + * @param[in] line Line in which the error occurred. + */ +#define ARM_COMPUTE_ERROR_ON_LOC(cond, func, file, line) \ + ARM_COMPUTE_ERROR_ON_LOC_MSG(cond, func, file, line, #cond)
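A hedged example of how these macros are typically used (the function and message are illustrative). With ARM_COMPUTE_ASSERTS_ENABLED defined the check prints a message and throws; in release builds it compiles away entirely, so the checked parameter should still be marked as used.

#include "arm_compute/core/Error.h"

void set_num_bins(unsigned int num_bins)
{
    ARM_COMPUTE_ERROR_ON_MSG(num_bins == 0, "Number of bins must be greater than 0");
    ARM_COMPUTE_UNUSED(num_bins); // avoids -Wunused warnings when asserts are disabled
    // ... use num_bins ...
}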
+ +namespace arm_compute +{ +/** Print an error message then throw an std::runtime_error + * + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] msg Message to display before aborting. + * @param[in] ... Variable number of arguments of the message. + */ +[[noreturn]] void error(const char *function, const char *file, const int line, const char *msg, ...); + +/** Print a debug message + * + * @param[in] function Function in which the message was printed. + * @param[in] file Name of the file where the message was printed. + * @param[in] line Line on which the message was printed. + * @param[in] msg Message to display. + * @param[in] ... Variable number of arguments of the message. + */ +void debug(const char *function, const char *file, const int line, const char *msg, ...); +} + +#endif /* __ARM_COMPUTE_ERROR_H__ */ diff --git a/arm_compute/core/FixedPoint.h b/arm_compute/core/FixedPoint.h new file mode 100644 index 0000000000..925b4949a3 --- /dev/null +++ b/arm_compute/core/FixedPoint.h @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_FIXEDPOINT_H__ +#define __ARM_COMPUTE_FIXEDPOINT_H__ + +#include <cstdint> + +namespace arm_compute +{ +using qint8_t = int8_t; /**< 8 bit fixed point scalar value */ +using qint16_t = int16_t; /**< 16 bit fixed point scalar value */ +using qint32_t = int32_t; /**< 32 bit fixed point scalar value */ + +/** 8 bit fixed point scalar saturating shift left + * + * @param[in] a 8 bit fixed point input + * @param[in] shift Shift amount + * + * @return The result of the 8 bit fixed point shift. The result is saturated in case of overflow + */ +qint8_t sqshl_qs8(qint8_t a, int shift); + +/** 8 bit fixed point scalar absolute value + * + * @param[in] a 8 bit fixed point input + * + * @return The result of the 8 bit fixed point absolute value + */ +qint8_t sabs_qs8(qint8_t a); + +/** 8 bit fixed point scalar add + * + * @param[in] a First 8 bit fixed point input + * @param[in] b Second 8 bit fixed point input + * + * @return The result of the 8 bit fixed point addition + */ +qint8_t sadd_qs8(qint8_t a, qint8_t b); + +/** 8 bit fixed point scalar saturating add + * + * @param[in] a First 8 bit fixed point input + * @param[in] b Second 8 bit fixed point input + * + * @return The result of the 8 bit fixed point addition. The result is saturated in case of overflow + */ +qint8_t sqadd_qs8(qint8_t a, qint8_t b); + +/** 16 bit fixed point scalar saturating add + * + * @param[in] a First 16 bit fixed point input + * @param[in] b Second 16 bit fixed point input + * + * @return The result of the 16 bit fixed point addition.
The result is saturated in case of overflow + */ +qint16_t sqadd_qs16(qint16_t a, qint16_t b); + +/** 8 bit fixed point scalar subtraction + * + * @param[in] a First 8 bit fixed point input + * @param[in] b Second 8 bit fixed point input + * + * @return The result of the 8 bit fixed point subtraction + */ +qint8_t ssub_qs8(qint8_t a, qint8_t b); + +/** 8 bit fixed point scalar saturating subtraction + * + * @param[in] a First 8 bit fixed point input + * @param[in] b Second 8 bit fixed point input + * + * @return The result of the 8 bit fixed point subtraction. The result is saturated in case of overflow + */ +qint8_t sqsub_qs8(qint8_t a, qint8_t b); + +/** 8 bit fixed point scalar multiply + * + * @param[in] a First 8 bit fixed point input + * @param[in] b Second 8 bit fixed point input + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8 bit fixed point multiplication. + */ +qint8_t smul_qs8(qint8_t a, qint8_t b, int fixed_point_position); + +/** 8 bit fixed point scalar saturating multiply + * + * @param[in] a First 8 bit fixed point input + * @param[in] b Second 8 bit fixed point input + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8 bit fixed point multiplication. The result is saturated in case of overflow + */ +qint8_t sqmul_qs8(qint8_t a, qint8_t b, int fixed_point_position); + +/** 8 bit fixed point scalar multiply long (widening multiply returning a 16 bit result) + * + * @param[in] a First 8 bit fixed point input + * @param[in] b Second 8 bit fixed point input + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8 bit fixed point multiply long. + */ +qint16_t sqmull_qs8(qint8_t a, qint8_t b, int fixed_point_position); + +/** 16 bit fixed point scalar saturating multiply + * + * @param[in] a First 16 bit fixed point input + * @param[in] b Second 16 bit fixed point input + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit fixed point multiplication. The result is saturated in case of overflow + */ +qint16_t sqmul_qs16(qint16_t a, qint16_t b, int fixed_point_position); + +/** 8 bit fixed point scalar inverse square root + * + * @param[in] a 8 bit fixed point input + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8 bit fixed point inverse square root. + */ +qint8_t sinvsqrt_qs8(qint8_t a, int fixed_point_position); + +/** 8 bit fixed point scalar division + * + * @param[in] a First 8 bit fixed point input + * @param[in] b Second 8 bit fixed point input + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8 bit fixed point division. + */ +qint8_t sdiv_qs8(qint8_t a, qint8_t b, int fixed_point_position); + +/** 8 bit fixed point scalar saturating exponential + * + * @param[in] a 8 bit fixed point input + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8 bit fixed point exponential.
+ */ +qint8_t sqexp_qs8(qint8_t a, int fixed_point_position); + +/** 8 bit fixed point scalar logarithm + * + * @param[in] a 8 bit fixed point input + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8 bit fixed point logarithm. + */ +qint8_t slog_qs8(qint8_t a, int fixed_point_position); + +/** Convert an 8 bit fixed point to float + * + * @param[in] a Input to convert + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the conversion 8 bit fixed point -> float + */ +float scvt_f32_qs8(qint8_t a, int fixed_point_position); + +/** Convert a float to 8 bit fixed point + * + * @param[in] a Input to convert + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the conversion float -> 8 bit fixed point + */ +qint8_t scvt_qs8_f32(float a, int fixed_point_position); + +/** Convert a 16 bit fixed point to float + * + * @param[in] a Input to convert + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the conversion 16 bit fixed point -> float + */ +float scvt_f32_qs16(qint16_t a, int fixed_point_position); + +/** Convert a float to 16 bit fixed point + * + * @param[in] a Input to convert + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the conversion float -> 16 bit fixed point + */ +qint16_t scvt_qs16_f32(float a, int fixed_point_position); + +/** Scalar saturating move and narrow. + * + * @param[in] a Input to convert to 8 bit fixed point + * + * @return The narrowing conversion to 8 bit + */ +qint8_t sqmovn_qs16(qint16_t a); +} +#include "arm_compute/core/FixedPoint.inl" +#endif /* __ARM_COMPUTE_FIXEDPOINT_H__ */
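A small worked example of the Q-format arithmetic these functions implement, assuming the declarations above are linked against the library. With fixed_point_position = 4 (Q3.4), 1.0f is represented as 16, and a multiply is a widened product, rounded, then shifted back down.

#include "arm_compute/core/FixedPoint.h"
#include <cassert>

using namespace arm_compute;

int main()
{
    const int fpp = 4; // Q3.4: 4 fractional bits

    qint8_t a = scvt_qs8_f32(1.5f, fpp); // 1.5 * 16 + 0.5 -> 24
    assert(a == 24);

    // 1.5 * 1.5 = 2.25 -> (24 * 24 + 8) >> 4 = 36, i.e. 36 / 16.0 = 2.25
    qint8_t p = sqmul_qs8(a, a, fpp);
    assert(p == 36);
    assert(scvt_f32_qs8(p, fpp) == 2.25f);
    return 0;
}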
diff --git a/arm_compute/core/FixedPoint.inl b/arm_compute/core/FixedPoint.inl new file mode 100644 index 0000000000..4263a6f00d --- /dev/null +++ b/arm_compute/core/FixedPoint.inl @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include <cmath> +#include <limits> + +namespace +{ +template <typename TpSat, typename TpIn> +inline TpSat saturate_convert(TpIn a) +{ + if(a > std::numeric_limits<TpSat>::max()) + { + a = std::numeric_limits<TpSat>::max(); + } + if(a < std::numeric_limits<TpSat>::min()) + { + a = std::numeric_limits<TpSat>::min(); + } + return static_cast<TpSat>(a); +} +} // namespace + +namespace arm_compute +{ +inline qint8_t sqshl_qs8(qint8_t a, int shift) +{ + qint16_t tmp = static_cast<qint16_t>(a) << shift; + // Saturate the result in case of overflow and cast to qint8_t + return saturate_convert<qint16_t, qint8_t>(tmp); +} + +inline qint8_t sabs_qs8(qint8_t a) +{ + return (a < 0) ? -a : a; +} + +inline qint8_t sadd_qs8(qint8_t a, qint8_t b) +{ + return a + b; +} + +inline qint8_t sqadd_qs8(qint8_t a, qint8_t b) +{ + // We need to store the temporary result in qint16_t otherwise we cannot evaluate the overflow + qint16_t tmp = (static_cast<qint16_t>(a) + static_cast<qint16_t>(b)); + + // Saturate the result in case of overflow and cast to qint8_t + return saturate_convert<qint16_t, qint8_t>(tmp); +} + +inline qint16_t sqadd_qs16(qint16_t a, qint16_t b) +{ + // We need to store the temporary result in qint32_t otherwise we cannot evaluate the overflow + qint32_t tmp = (static_cast<qint32_t>(a) + static_cast<qint32_t>(b)); + + // Saturate the result in case of overflow and cast to qint16_t + return saturate_convert<qint32_t, qint16_t>(tmp); +} + +inline qint8_t ssub_qs8(qint8_t a, qint8_t b) +{ + return a - b; +} + +inline qint8_t sqsub_qs8(qint8_t a, qint8_t b) +{ + // We need to store the temporary result in qint16_t otherwise we cannot evaluate the overflow + qint16_t tmp = static_cast<qint16_t>(a) - static_cast<qint16_t>(b); + + // Saturate the result in case of overflow and cast to qint8_t + return saturate_convert<qint16_t, qint8_t>(tmp); +} + +inline qint8_t smul_qs8(qint8_t a, qint8_t b, int fixed_point_position) +{ + const qint16_t round_up_const = (1 << (fixed_point_position - 1)); + + qint16_t tmp = static_cast<qint16_t>(a) * static_cast<qint16_t>(b); + + // Rounding up + tmp += round_up_const; + + return static_cast<qint8_t>(tmp >> fixed_point_position); +} + +inline qint8_t sqmul_qs8(qint8_t a, qint8_t b, int fixed_point_position) +{ + const qint16_t round_up_const = (1 << (fixed_point_position - 1)); + + qint16_t tmp = static_cast<qint16_t>(a) * static_cast<qint16_t>(b); + + // Rounding up + tmp += round_up_const; + + return saturate_convert<qint16_t, qint8_t>(tmp >> fixed_point_position); +} + +inline qint16_t sqmul_qs16(qint16_t a, qint16_t b, int fixed_point_position) +{ + const qint32_t round_up_const = (1 << (fixed_point_position - 1)); + + qint32_t tmp = static_cast<qint32_t>(a) * static_cast<qint32_t>(b); + + // Rounding up + tmp += round_up_const; + + return saturate_convert<qint32_t, qint16_t>(tmp >> fixed_point_position); +} + +inline qint16_t sqmull_qs8(qint8_t a, qint8_t b, int fixed_point_position) +{ + const qint16_t round_up_const = (1 << (fixed_point_position - 1)); + + qint16_t tmp = static_cast<qint16_t>(a) * static_cast<qint16_t>(b); + + // Rounding up + tmp += round_up_const; + + return tmp >> fixed_point_position; +} + +inline qint8_t sinvsqrt_qs8(qint8_t a, int fixed_point_position) +{ + qint8_t shift = 8 - (fixed_point_position + (__builtin_clz(a) - 24)); + + qint8_t const_three = (3 << fixed_point_position); + qint8_t temp = shift < 0 ? (a << -shift) : (a >> shift); + qint8_t x2 = temp; + + // Three Newton-Raphson iterations: x_{n+1} = x_n * (3 - a * x_n^2) / 2 + for(int i = 0; i < 3; i++) + { + qint8_t three_minus_dx = ssub_qs8(const_three, smul_qs8(temp, smul_qs8(x2, x2, fixed_point_position), fixed_point_position)); + x2 = (smul_qs8(x2, three_minus_dx, fixed_point_position) >> 1); + } + + temp = shift < 0 ?
(x2 << (-shift >> 1)) : (x2 >> (shift >> 1)); + + return temp; +} + +inline qint8_t sdiv_qs8(qint8_t a, qint8_t b, int fixed_point_position) +{ + qint16_t temp = a << fixed_point_position; + return static_cast<qint8_t>(temp / b); +} + +inline qint8_t sqexp_qs8(qint8_t a, int fixed_point_position) +{ + // Constants + qint8_t const_one = (1 << fixed_point_position); + qint8_t ln2 = ((0x58 >> (6 - fixed_point_position)) + 1) >> 1; + qint8_t inv_ln2 = (((0x38 >> (6 - fixed_point_position)) + 1) >> 1) | const_one; + qint8_t A = ((0x7F >> (6 - fixed_point_position)) + 1) >> 1; + qint8_t B = ((0x3F >> (6 - fixed_point_position)) + 1) >> 1; + qint8_t C = ((0x16 >> (6 - fixed_point_position)) + 1) >> 1; + qint8_t D = ((0x05 >> (6 - fixed_point_position)) + 1) >> 1; + + // Polynomial expansion + int dec_a = (sqmul_qs8(a, inv_ln2, fixed_point_position) >> fixed_point_position); + qint8_t alpha = sabs_qs8(sqsub_qs8(a, sqmul_qs8(ln2, sqshl_qs8(dec_a, fixed_point_position), fixed_point_position))); + qint8_t sum = sqadd_qs8(sqmul_qs8(alpha, D, fixed_point_position), C); + sum = sqadd_qs8(sqmul_qs8(alpha, sum, fixed_point_position), B); + sum = sqadd_qs8(sqmul_qs8(alpha, sum, fixed_point_position), A); + sum = sqmul_qs8(alpha, sum, fixed_point_position); + sum = sqadd_qs8(sum, const_one); + + return (dec_a < 0) ? (sum >> -dec_a) : sqshl_qs8(sum, dec_a); +} + +inline qint8_t slog_qs8(qint8_t a, int fixed_point_position) +{ + // Constants + qint8_t const_one = (1 << fixed_point_position); + qint8_t ln2 = (0x58 >> (7 - fixed_point_position)); + qint8_t A = (0x5C >> (7 - fixed_point_position - 1)); + qint8_t B = -(0x56 >> (7 - fixed_point_position)); + qint8_t C = (0x29 >> (7 - fixed_point_position)); + qint8_t D = -(0x0A >> (7 - fixed_point_position)); + + if((const_one == a) || (a < 0)) + { + return 0; + } + else if(a < const_one) + { + return -slog_qs8(sdiv_qs8(const_one, a, fixed_point_position), fixed_point_position); + } + + // Remove even powers of 2 + qint8_t shift_val = 31 - __builtin_clz(a >> fixed_point_position); + a >>= shift_val; + a = ssub_qs8(a, const_one); + + // Polynomial expansion + auto sum = sqadd_qs8(sqmul_qs8(a, D, fixed_point_position), C); + sum = sqadd_qs8(sqmul_qs8(a, sum, fixed_point_position), B); + sum = sqadd_qs8(sqmul_qs8(a, sum, fixed_point_position), A); + sum = sqmul_qs8(a, sum, fixed_point_position); + + return smul_qs8(sadd_qs8(sum, shift_val << fixed_point_position), ln2, fixed_point_position); +} + +inline float scvt_f32_qs8(qint8_t a, int fixed_point_position) +{ + return static_cast<float>(a) / (1 << fixed_point_position); +} + +inline qint8_t scvt_qs8_f32(float a, int fixed_point_position) +{ + // round_nearest_integer(a * 2^(fixed_point_position)) + return static_cast<qint8_t>(static_cast<float>(a) * (1 << fixed_point_position) + 0.5f); +} + +inline float scvt_f32_qs16(qint16_t a, int fixed_point_position) +{ + return static_cast<float>(a) / (1 << fixed_point_position); +} + +inline qint16_t scvt_qs16_f32(float a, int fixed_point_position) +{ + // round_nearest_integer(a * 2^(fixed_point_position)) + return static_cast<qint16_t>(static_cast<float>(a) * (1 << fixed_point_position) + 0.5f); +} + +inline qint8_t sqmovn_qs16(qint16_t a) +{ + // Saturate the result in case of overflow and cast to qint8_t + return saturate_convert<qint16_t, qint8_t>(a); +} +} diff --git a/arm_compute/core/HOGInfo.h b/arm_compute/core/HOGInfo.h new file mode 100644 index 0000000000..654629306d --- /dev/null +++ b/arm_compute/core/HOGInfo.h @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_HOGINFO_H__ +#define __ARM_COMPUTE_HOGINFO_H__ + +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/Types.h" + +#include + +namespace arm_compute +{ +/** Store the HOG's metadata */ +class HOGInfo +{ +public: + /** Default constructor */ + HOGInfo(); + /** Default destructor */ + virtual ~HOGInfo() = default; + /** Allow instances of this class to be copy constructed */ + HOGInfo(const HOGInfo &) = default; + /** Allow instances of this class to be copied */ + HOGInfo &operator=(const HOGInfo &) = default; + /** Allow instances of this class to be move constructed */ + HOGInfo(HOGInfo &&) = default; + /** Allow instances of this class to be moved */ + HOGInfo &operator=(HOGInfo &&) = default; + /** Constructor + * + * @param[in] cell_size Cell size in pixels + * @param[in] block_size Block size in pixels. Must be a multiple of cell_size. + * @param[in] detection_window_size Detection window size in pixels. Must be a multiple of block_size and block_stride. + * @param[in] block_stride Distance in pixels between 2 consecutive blocks along the x and y direction. Must be a multiple of cell size + * @param[in] num_bins Number of histogram bins for each cell + * @param[in] normalization_type (Optional) Normalization type to use for each block + * @param[in] l2_hyst_threshold (Optional) Threshold used for L2HYS_NORM normalization method + * @param[in] phase_type (Optional) Type of @ref PhaseType + */ + HOGInfo(const Size2D &cell_size, const Size2D &block_size, const Size2D &detection_window_size, const Size2D &block_stride, size_t num_bins, + HOGNormType normalization_type = HOGNormType::L2HYS_NORM, float l2_hyst_threshold = 0.2f, PhaseType phase_type = PhaseType::UNSIGNED); + /** Initialize the metadata structure with the given parameters + * + * @param[in] cell_size Cell size in pixels + * @param[in] block_size Block size in pixels. Must be a multiple of cell_size. + * @param[in] detection_window_size Detection window size in pixels. Must be a multiple of block_size and block_stride. + * @param[in] block_stride Distance in pixels between 2 consecutive blocks along the x and y direction. 
Must be a multiple of cell size + * @param[in] num_bins Number of histogram bins for each cell + * @param[in] normalization_type (Optional) Normalization type to use for each block + * @param[in] l2_hyst_threshold (Optional) Threshold used for L2HYS_NORM normalization method + * @param[in] phase_type (Optional) Type of @ref PhaseType + */ + void init(const Size2D &cell_size, const Size2D &block_size, const Size2D &detection_window_size, const Size2D &block_stride, size_t num_bins, + HOGNormType normalization_type = HOGNormType::L2HYS_NORM, float l2_hyst_threshold = 0.2f, PhaseType phase_type = PhaseType::UNSIGNED); + /** The cell size in pixels + * + * @return The cell size in pixels + */ + const Size2D &cell_size() const; + /** The block size in pixels + * + * @return The block size in pixels + */ + const Size2D &block_size() const; + /** The detection window size in pixels + * + * @return The detection window size in pixels + */ + const Size2D &detection_window_size() const; + /** The block stride in pixels. The block stride is the distance between 2 consecutive blocks + * + * @return The block stride in pixels + */ + const Size2D &block_stride() const; + /** The number of histogram bins for each cell + * + * @return The number of histogram bins for each cell + */ + size_t num_bins() const; + /** The normalization type + * + * @return The normalization type + */ + HOGNormType normalization_type() const; + /** Threshold used for L2HYS_NORM normalization type + * + * @return Threshold used for L2HYS_NORM normalization type + */ + float l2_hyst_threshold() const; + /** The type of @ref PhaseType + * + * @return The type of @ref PhaseType + */ + PhaseType phase_type() const; + /** The size of HOG descriptor + * + * @return The size of HOG descriptor + */ + size_t descriptor_size() const; + /** Calculates the number of cells for each block + * + * @return The Size2D data object which stores the number of cells along the x and y directions + */ + Size2D num_cells_per_block() const; + /** Calculates the number of blocks for the given image size + * + * @param[in] image_size The input image size data object + * + * @return The Size2D data object which stores the number of blocks along the x and y directions + */ + Size2D num_blocks_per_image(const Size2D &image_size) const; + +private: + Size2D _cell_size; + Size2D _block_size; + Size2D _detection_window_size; + Size2D _block_stride; + size_t _num_bins; + HOGNormType _normalization_type; + float _l2_hyst_threshold; + PhaseType _phase_type; + size_t _descriptor_size; +}; +} +#endif /*__ARM_COMPUTE_HOGINFO_H__ */ diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h new file mode 100644 index 0000000000..07318eaf7a --- /dev/null +++ b/arm_compute/core/Helpers.h @@ -0,0 +1,507 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_HELPERS_H__ +#define __ARM_COMPUTE_HELPERS_H__ + +#include "arm_compute/core/CL/CLTypes.h" +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/Steps.h" +#include "arm_compute/core/Strides.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Window.h" +#include <array> +#include <cstddef> +#include <cstdint> +#include <memory> +#include <tuple> +#include <type_traits> +#include <utility> + +namespace arm_compute +{ +class IKernel; +class ITensor; +class ITensorInfo; + +namespace cpp14 +{ +template <typename T> +struct _Unique_if +{ + typedef std::unique_ptr<T> _Single_object; +}; + +template <typename T> +struct _Unique_if<T[]> +{ + typedef std::unique_ptr<T[]> _Unknown_bound; +}; + +template <typename T, size_t N> +struct _Unique_if<T[N]> +{ + typedef void _Known_bound; +}; + +template <typename T, typename... Args> +typename _Unique_if<T>::_Single_object +make_unique(Args &&... args) +{ + return std::unique_ptr<T>(new T(std::forward<Args>(args)...)); +} + +template <typename T> +typename _Unique_if<T>::_Unknown_bound +make_unique(size_t n) +{ + typedef typename std::remove_extent<T>::type U; + return std::unique_ptr<T>(new U[n]()); +} + +template <typename T, typename... Args> +typename _Unique_if<T>::_Known_bound +make_unique(Args &&...) = delete; +} + +template <typename T> +struct enable_bitwise_ops +{ + static constexpr bool value = false; +}; + +template <typename T> +typename std::enable_if<enable_bitwise_ops<T>::value, T>::type operator&(T lhs, T rhs) +{ + using underlying_type = typename std::underlying_type<T>::type; + return static_cast<T>(static_cast<underlying_type>(lhs) & static_cast<underlying_type>(rhs)); +} + +namespace traits +{ +/** Check if a type T is contained in a tuple Tuple of types */ +template <typename T, typename Tuple> +struct is_contained; + +template <typename T> +struct is_contained<T, std::tuple<>> : std::false_type +{ +}; + +template <typename T, typename... Ts> +struct is_contained<T, std::tuple<T, Ts...>> : std::true_type +{ +}; + +template <typename T, typename U, typename... Ts> +struct is_contained<T, std::tuple<U, Ts...>> : is_contained<T, std::tuple<Ts...>> +{ +}; +} + +/** Computes bilinear interpolation using the pointer to the top-left pixel and the pixel's distance between + * the real coordinates and the smallest following integer coordinates. + * + * @param[in] pixel_ptr Pointer to the top-left pixel value. Format: Single channel U8 + * @param[in] stride Stride to access the bottom-left and bottom-right pixel values + * @param[in] dx Pixel's distance between the X real coordinate and the smallest X following integer + * @param[in] dy Pixel's distance between the Y real coordinate and the smallest Y following integer + * + * @note dx and dy must be in the range [0, 1.0] + * + * @return The bilinear interpolated pixel value + */ +inline uint8_t delta_bilinear_c1u8(const uint8_t *pixel_ptr, size_t stride, float dx, float dy);
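The header only declares delta_bilinear_c1u8 (the library's definition lives elsewhere, in an .inl file). As an illustration of the standard bilinear weighting the documentation describes, here is a self-contained reimplementation sketch; the function name is hypothetical:

#include <cstddef>
#include <cstdint>

// pixel_ptr points at the top-left pixel; stride moves one row down.
inline uint8_t delta_bilinear_sketch(const uint8_t *pixel_ptr, size_t stride, float dx, float dy)
{
    const float dx1 = 1.0f - dx;
    const float dy1 = 1.0f - dy;

    const float tl = pixel_ptr[0];          // top-left
    const float tr = pixel_ptr[1];          // top-right
    const float bl = pixel_ptr[stride];     // bottom-left
    const float br = pixel_ptr[stride + 1]; // bottom-right

    // Weight each corner by the area of the opposite sub-rectangle
    return static_cast<uint8_t>(tl * dx1 * dy1 + tr * dx * dy1 + bl * dx1 * dy + br * dx * dy);
}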
+/** Return the pixel at (x,y) using bilinear interpolation. The image must be single channel U8 + * + * @warning Only works if the iterator was created with an IImage + * + * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel U8 image. + * @param[in] stride Stride in bytes of the image. + * @param[in] x X position of the wanted pixel + * @param[in] y Y position of the wanted pixel + * + * @return The pixel at (x, y) using bilinear interpolation. + */ +inline uint8_t pixel_bilinear_c1u8(const uint8_t *first_pixel_ptr, size_t stride, float x, float y); + +/** Return the pixel at (x,y) using bilinear interpolation by clamping when out of borders. The image must be single channel U8 + * + * @warning Only works if the iterator was created with an IImage + * + * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel U8 image. + * @param[in] stride Stride in bytes of the image + * @param[in] width Width of the image + * @param[in] height Height of the image + * @param[in] x X position of the wanted pixel + * @param[in] y Y position of the wanted pixel + * + * @return The pixel at (x, y) using bilinear interpolation. + */ +inline uint8_t pixel_bilinear_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float x, float y); + +/** Return the pixel at (x,y) using area interpolation by clamping when out of borders. The image must be single channel U8 + * + * @note The interpolation area depends on the width and height ratio of the input and output images + * @note Currently average of the contributing pixels is calculated + * + * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel U8 image. + * @param[in] stride Stride in bytes of the image + * @param[in] width Width of the image + * @param[in] height Height of the image + * @param[in] wr Width ratio between the input image width and output image width. + * @param[in] hr Height ratio between the input image height and output image height. + * @param[in] x X position of the wanted pixel + * @param[in] y Y position of the wanted pixel + * + * @return The pixel at (x, y) using area interpolation. + */ +inline uint8_t pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr, float hr, int x, int y); + +/** Performs clamping among a lower and upper value. + * + * @param[in] n Value to clamp. + * @param[in] lower Lower threshold. + * @param[in] upper Upper threshold. + * + * @return Clamped value. + */ +template <typename T> +inline T clamp(const T &n, const T &lower, const T &upper) +{ + return std::max(lower, std::min(n, upper)); +} + +/** Base case of for_each. Does nothing. */ +template <typename F> +inline void for_each(F &&) +{ +} + +/** Call the function for each of the arguments + * + * @param[in] func Function to be called + * @param[in] arg Argument passed to the function + * @param[in] args Remaining arguments + */ +template <typename F, typename T, typename... Ts> +inline void for_each(F &&func, T &&arg, Ts &&... args) +{ + func(arg); + for_each(func, args...); +} + +/** Base case of foldl. + * + * @return value. + */ +template <typename F, typename T> +inline T foldl(F &&, const T &value) +{ + return value; +} + +/** Base case of foldl. + * + * @return Function evaluation for value1 and value2 + */ +template <typename F, typename T, typename U> +inline auto foldl(F &&func, T &&value1, U &&value2) -> decltype(func(value1, value2)) +{ + return func(value1, value2); +}
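An illustrative use of these variadic helpers (the demo function is hypothetical): for_each applies a functor to every argument in order, and foldl left-folds a binary functor through them.

#include <iostream>

void variadic_helpers_demo()
{
    using namespace arm_compute;

    // Prints 1, 2 and 3, each on its own line
    for_each([](int v) { std::cout << v << std::endl; }, 1, 2, 3);

    // ((0 + 1) + 2) + 3 = 6
    const int sum = foldl([](int a, int b) { return a + b; }, 0, 1, 2, 3);
    std::cout << sum << std::endl;
}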
+/** Fold left.
+ *
+ * @param[in] func    Function to be called
+ * @param[in] initial Initial value
+ * @param[in] value   Argument passed to the function
+ * @param[in] values  Remaining arguments
+ */
+template <typename F, typename I, typename T, typename... Vs>
+inline I foldl(F &&func, I &&initial, T &&value, Vs &&... values)
+{
+    return foldl(std::forward<F>(func), func(std::forward<I>(initial), std::forward<T>(value)), std::forward<Vs>(values)...);
+}
+
+/** Iterator updated by @ref execute_window_loop for each window element */
+class Iterator
+{
+public:
+    /** Default constructor to create an empty iterator */
+    constexpr Iterator();
+    /** Create a container iterator for the metadata and allocation contained in the ITensor
+     *
+     * @param[in] tensor The tensor to associate to the iterator.
+     * @param[in] window The window which will be used to iterate over the tensor.
+     */
+    Iterator(const ITensor *tensor, const Window &window);
+
+    /** Increment the iterator along the specified dimension by the step value associated with that dimension.
+     *
+     * @warning It is the caller's responsibility to call increment(dimension+1) when reaching the end of a dimension, the iterator will not check for overflow.
+     *
+     * @note When incrementing a dimension 'n' the coordinates of all the dimensions in the range (0,n-1) are reset. For example if you iterate over a 2D image, every time you change row (dimension 1), the iterator for the width (dimension 0) is reset to its start.
+     *
+     * @param[in] dimension Dimension to increment
+     */
+    void increment(size_t dimension);
+
+    /** Return the offset in bytes from the first element to the current position of the iterator
+     *
+     * @return The current position of the iterator in bytes relative to the first element.
+     */
+    constexpr int offset() const;
+
+    /** Return a pointer to the current pixel.
+     *
+     * @warning Only works if the iterator was created with an ITensor.
+     *
+     * @return equivalent to buffer() + offset()
+     */
+    constexpr uint8_t *ptr() const;
+
+    /** Move the iterator back to the beginning of the specified dimension.
+     *
+     * @param[in] dimension Dimension to reset
+     */
+    void reset(size_t dimension);
+
+private:
+    uint8_t *_ptr;
+
+    class Dimension
+    {
+    public:
+        constexpr Dimension()
+            : _dim_start(0), _stride(0)
+        {
+        }
+
+        int _dim_start;
+        int _stride;
+    };
+
+    std::array<Dimension, Coordinates::num_max_dimensions> _dims;
+};
+
+/** Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each element.
+ * It passes the x and y positions to the lambda_function for each iteration
+ *
+ * @param[in]     w               Window to iterate through.
+ * @param[in]     lambda_function The function of type void(const Coordinates &id) to call at each iteration.
+ *                                Where id represents the absolute coordinates of the item to process.
+ * @param[in,out] iterators       Tensor iterators which will be updated by this function before calling lambda_function.
+ */
+template <typename L, typename... Ts>
+inline void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators);
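A minimal sketch of how this declaration is typically driven (illustrative names; assumes an allocated single-channel U8 tensor):

    void fill_u8_sketch(ITensor &tensor, uint8_t value)
    {
        Window win = calculate_max_window(*tensor.info());
        Iterator it(&tensor, win);
        execute_window_loop(win, [&](const Coordinates &)
        {
            *it.ptr() = value; // points at the element currently visited
        },
        it);
    }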
+/** Update window and padding size for each of the access patterns.
+ *
+ * First the window size is reduced based on all access patterns that are not
+ * allowed to modify the padding of the underlying tensor. Then the padding of
+ * the remaining tensors is increased to match the window.
+ *
+ * @param[in] win      Window that is used by the kernel.
+ * @param[in] patterns Access patterns used to calculate the final window and padding.
+ *
+ * @return True if the window has been changed. Changes to the padding do not
+ *         influence the returned value.
+ */
+template <typename... Ts>
+bool update_window_and_padding(Window &win, Ts &&... patterns)
+{
+    bool window_changed = false;
+
+    for_each([&](const IAccessWindow & w)
+    {
+        window_changed |= w.update_window_if_needed(win);
+    },
+    patterns...);
+
+    bool padding_changed = false;
+
+    for_each([&](const IAccessWindow & w)
+    {
+        padding_changed |= w.update_padding_if_needed(win);
+    },
+    patterns...);
+
+    return window_changed;
+}
+
+/** Calculate the maximum window for a given tensor shape and border setting
+ *
+ * @param[in] info        Tensor info object defining the shape of the object for which the window is created.
+ * @param[in] steps       (Optional) Number of elements processed for each step.
+ * @param[in] skip_border (Optional) If true exclude the border region from the window.
+ * @param[in] border_size (Optional) Border size.
+ *
+ * @return The maximum window the kernel can be executed on.
+ */
+Window calculate_max_window(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
+
+/** Calculate the maximum window used by a horizontal kernel for a given tensor shape and border setting
+ *
+ * @param[in] info        Tensor info object defining the shape of the object for which the window is created.
+ * @param[in] steps       (Optional) Number of elements processed for each step.
+ * @param[in] skip_border (Optional) If true exclude the border region from the window.
+ * @param[in] border_size (Optional) Border size. The border region will be excluded from the window.
+ *
+ * @return The maximum window the kernel can be executed on.
+ */
+Window calculate_max_window_horizontal(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
+
+/** Calculate the maximum window for a given tensor shape and border setting. The window will also include the border.
+ *
+ * @param[in] info        Tensor info object defining the shape of the object for which the window is created.
+ * @param[in] steps       (Optional) Number of elements processed for each step.
+ * @param[in] border_size (Optional) Border size. The border region will be included in the window.
+ *
+ * @return The maximum window the kernel can be executed on.
+ */
+Window calculate_max_enlarged_window(const ITensorInfo &info, const Steps &steps = Steps(), BorderSize border_size = BorderSize());
+
+/** Intersect multiple valid regions.
+ *
+ * @param[in] regions Valid regions.
+ *
+ * @return Intersection of all regions.
+ */
+template <typename... Ts>
+ValidRegion intersect_valid_regions(Ts &&... regions)
+{
+    auto intersect = [](const ValidRegion & r1, const ValidRegion & r2) -> ValidRegion
+    {
+        ValidRegion region;
+
+        for(size_t d = 0; d < std::min(r1.anchor.num_dimensions(), r2.anchor.num_dimensions()); ++d)
+        {
+            region.anchor.set(d, std::max(r1.anchor[d], r2.anchor[d]));
+        }
+
+        for(size_t d = 0; d < std::min(r1.shape.num_dimensions(), r2.shape.num_dimensions()); ++d)
+        {
+            region.shape.set(d, std::min(r1.shape[d], r2.shape[d]));
+        }
+
+        return region;
+    };
+
+    return foldl(intersect, std::forward<Ts>(regions)...);
+}
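How kernels typically combine these helpers at configure time (a sketch; the tensor pointers and the step size are illustrative):

    void configure_sketch(ITensor *input, ITensor *output)
    {
        constexpr unsigned int num_elems_processed_per_iteration = 16;

        Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));

        AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
        AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);

        // Shrinks the window and/or grows the padding until both accesses fit.
        update_window_and_padding(win, input_access, output_access);
    }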
+/** Create a strides object based on the provided strides and the tensor dimensions.
+ *
+ * @param[in] info          Tensor info object providing the shape of the tensor for unspecified strides.
+ * @param[in] stride_x      Stride to be used in X dimension (in bytes).
+ * @param[in] fixed_strides Strides to be used in higher dimensions starting at Y (in bytes).
+ *
+ * @return Strides object based on the specified strides. Missing strides are
+ *         calculated based on the tensor shape and the strides of lower dimensions.
+ */
+template <typename T, typename... Ts>
+inline Strides compute_strides(const ITensorInfo &info, T stride_x, Ts &&... fixed_strides)
+{
+    const TensorShape &shape = info.tensor_shape();
+
+    // Create strides object
+    Strides strides(stride_x, fixed_strides...);
+
+    for(size_t i = 1 + sizeof...(Ts); i < info.num_dimensions(); ++i)
+    {
+        strides.set(i, shape[i - 1] * strides[i - 1]);
+    }
+
+    return strides;
+}
+
+/** Create a strides object based on the tensor dimensions.
+ *
+ * @param[in] info Tensor info object used to compute the strides.
+ *
+ * @return Strides object based on element size and tensor shape.
+ */
+template <typename... Ts>
+inline Strides compute_strides(const ITensorInfo &info)
+{
+    return compute_strides(info, info.element_size());
+}
+
+/** Auto initialize the tensor info (shape, number of channels, data type and fixed point position) if the current assignment is empty.
+ *
+ * @param[in,out] info                 Tensor info used to check and assign.
+ * @param[in]     shape                New shape.
+ * @param[in]     num_channels         New number of channels.
+ * @param[in]     data_type            New data type
+ * @param[in]     fixed_point_position New fixed point position
+ *
+ * @return True if the tensor info has been initialized
+ */
+bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, int fixed_point_position);
+
+/** Set the shape to the specified value if the current assignment is empty.
+ *
+ * @param[in,out] info  Tensor info used to check and assign.
+ * @param[in]     shape New shape.
+ *
+ * @return True if the shape has been changed.
+ */
+bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape);
+
+/** Set the format, data type and number of channels to the specified value if
+ * the current data type is unknown.
+ *
+ * @param[in,out] info   Tensor info used to check and assign.
+ * @param[in]     format New format.
+ *
+ * @return True if the format has been changed.
+ */
+bool set_format_if_unknown(ITensorInfo &info, Format format);
+
+/** Set the data type and number of channels to the specified value if
+ * the current data type is unknown.
+ *
+ * @param[in,out] info      Tensor info used to check and assign.
+ * @param[in]     data_type New data type.
+ *
+ * @return True if the data type has been changed.
+ */
+bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type);
+
+/** Set the fixed point position to the specified value if
+ * the current fixed point position is 0 and the data type is QS8 or QS16
+ *
+ * @param[in,out] info                 Tensor info used to check and assign.
+ * @param[in]     fixed_point_position New fixed point position
+ *
+ * @return True if the fixed point position has been changed.
+ */
+bool set_fixed_point_position_if_zero(ITensorInfo &info, int fixed_point_position);
+} // namespace arm_compute
+
+#include "arm_compute/core/Helpers.inl"
+#endif /*__ARM_COMPUTE_HELPERS_H__ */
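A worked sketch of the stride derivation above (illustrative; assumes TensorInfo from arm_compute/core/TensorInfo.h):

    // 2D U8 tensor, width 4, height 3, one channel.
    TensorInfo info(TensorShape(4U, 3U), 1, DataType::U8);
    const Strides strides = compute_strides(info);
    // strides[0] = element_size()        = 1 byte
    // strides[1] = shape[0] * strides[0] = 4 bytes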
diff --git a/arm_compute/core/Helpers.inl b/arm_compute/core/Helpers.inl
new file mode 100644
index 0000000000..f885810078
--- /dev/null
+++ b/arm_compute/core/Helpers.inl
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cmath>
+#include <numeric>
+
+namespace arm_compute
+{
+inline uint8_t delta_bilinear_c1u8(const uint8_t *pixel_ptr, size_t stride, float dx, float dy)
+{
+    ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
+
+    const float dx1 = 1.0f - dx;
+    const float dy1 = 1.0f - dy;
+
+    const float a00 = *pixel_ptr;
+    const float a01 = *(pixel_ptr + 1);
+    const float a10 = *(pixel_ptr + stride);
+    const float a11 = *(pixel_ptr + stride + 1);
+
+    const float w1 = dx1 * dy1;
+    const float w2 = dx * dy1;
+    const float w3 = dx1 * dy;
+    const float w4 = dx * dy;
+
+    return a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4;
+}
+
+inline uint8_t pixel_bilinear_c1u8(const uint8_t *first_pixel_ptr, size_t stride, float x, float y)
+{
+    ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
+
+    const int32_t xi = x;
+    const int32_t yi = y;
+
+    const float dx = x - xi;
+    const float dy = y - yi;
+
+    return delta_bilinear_c1u8(first_pixel_ptr + xi + yi * stride, stride, dx, dy);
+}
+
+inline uint8_t pixel_bilinear_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float x, float y)
+{
+    ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
+
+    x = std::max(-1.f, std::min(x, static_cast<float>(width)));
+    y = std::max(-1.f, std::min(y, static_cast<float>(height)));
+
+    const float xi = std::floor(x);
+    const float yi = std::floor(y);
+
+    const float dx = x - xi;
+    const float dy = y - yi;
+
+    return delta_bilinear_c1u8(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride, stride, dx, dy);
+}
+
+inline uint8_t pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr, float hr, int x, int y)
+{
+    ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
+
+    // Calculate sampling position
+    float in_x = (x + 0.5f) * wr - 0.5f;
+    float in_y = (y + 0.5f) * hr - 0.5f;
+
+    // Get bounding box offsets
+    int x_from = std::floor(x * wr - 0.5f - in_x);
+    int y_from = std::floor(y * hr - 0.5f - in_y);
+    int x_to   = std::ceil((x + 1) * wr - 0.5f - in_x);
+    int y_to   = std::ceil((y + 1) * hr - 0.5f - in_y);
+
+    // Clamp position to borders
+    in_x = std::max(-1.f, std::min(in_x, static_cast<float>(width)));
+    in_y = std::max(-1.f, std::min(in_y, static_cast<float>(height)));
+
+    // Clamp bounding box offsets to borders
+    x_from = ((in_x + x_from) < -1) ? -1 : x_from;
+    y_from = ((in_y + y_from) < -1) ? -1 : y_from;
+    x_to = ((in_x + x_to) > width) ? (width - in_x) : x_to;
+    y_to = ((in_y + y_to) > height) ? (height - in_y) : y_to;
+
+    // Get pixel index
+    const int xi = std::floor(in_x);
+    const int yi = std::floor(in_y);
+
+    // Bounding box elements in each dimension
+    const int x_elements = (x_to - x_from + 1);
+    const int y_elements = (y_to - y_from + 1);
+    ARM_COMPUTE_ERROR_ON(x_elements == 0 || y_elements == 0);
+
+    // Sum pixels in area
+    int sum = 0;
+    for(int j = yi + y_from, je = yi + y_to; j <= je; ++j)
+    {
+        const uint8_t *ptr = first_pixel_ptr + j * stride + xi + x_from;
+        sum                = std::accumulate(ptr, ptr + x_elements, sum);
+    }
+
+    // Return average
+    return sum / (x_elements * y_elements);
+}
+
+template <unsigned int dimension>
+struct IncrementIterators
+{
+    template <typename T, typename... Ts>
+    static void unroll(T &&it, Ts &&... iterators)
+    {
+        it.increment(dimension);
+        IncrementIterators<dimension>::unroll(std::forward<Ts>(iterators)...);
+    }
+
+    template <typename T>
+    static void unroll(T &&it)
+    {
+        it.increment(dimension);
+        // End of recursion
+    }
+
+    static void unroll()
+    {
+        // End of recursion
+    }
+};
+
+template <unsigned int dim>
+struct ForEachDimension
+{
+    template <typename L, typename... Ts>
+    static void unroll(const Window &w, Coordinates &id, L &&lambda_function, Ts &&... iterators)
+    {
+        const auto &d = w[dim - 1];
+
+        for(auto v = d.start(); v < d.end(); v += d.step(), IncrementIterators < dim - 1 >::unroll(iterators...))
+        {
+            id.set(dim - 1, v);
+            ForEachDimension < dim - 1 >::unroll(w, id, lambda_function, iterators...);
+        }
+    }
+};
+
+template <>
+struct ForEachDimension<0>
+{
+    template <typename L, typename... Ts>
+    static void unroll(const Window &w, Coordinates &id, L &&lambda_function, Ts &&... iterators)
+    {
+        lambda_function(id);
+    }
+};
+
+template <typename L, typename... Ts>
+inline void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
+{
+    w.validate();
+
+    Coordinates id;
+    ForEachDimension<Coordinates::num_max_dimensions>::unroll(w, id, std::forward<L>(lambda_function), std::forward<Ts>(iterators)...);
+}
+
+inline constexpr Iterator::Iterator()
+    : _ptr(nullptr), _dims()
+{
+}
+
+inline Iterator::Iterator(const ITensor *tensor, const Window &win)
+    : Iterator()
+{
+    ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+    const ITensorInfo *info = tensor->info();
+    ARM_COMPUTE_ERROR_ON(info == nullptr);
+    const Strides &strides = info->strides_in_bytes();
+
+    _ptr = tensor->buffer() + info->offset_first_element_in_bytes();
+
+    // Initialize the stride for each dimension and calculate the position of the first element of the iteration:
+    for(unsigned int n = 0; n < info->num_dimensions(); ++n)
+    {
+        _dims[n]._stride = win[n].step() * strides[n];
+        std::get<0>(_dims)._dim_start += strides[n] * win[n].start();
+    }
+
+    // Copy the starting point to all the dimensions:
+    for(unsigned int n = 1; n < Coordinates::num_max_dimensions; ++n)
+    {
+        _dims[n]._dim_start = std::get<0>(_dims)._dim_start;
+    }
+
+    ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(win, info->num_dimensions());
+}
+
+inline void Iterator::increment(const size_t dimension)
+{
+    ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions);
+
+    _dims[dimension]._dim_start += _dims[dimension]._stride;
+
+    for(unsigned int n = 0; n < dimension; ++n)
+    {
+        _dims[n]._dim_start = _dims[dimension]._dim_start;
+    }
+}
+
+inline constexpr int Iterator::offset() const
+{
+    return _dims.at(0)._dim_start;
+}
+
+inline constexpr uint8_t *Iterator::ptr() const
+{
+    return _ptr + _dims.at(0)._dim_start;
+}
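An illustrative trace of the bookkeeping above for a 2D window of width 3 (step 1) on a tensor with a row stride of 16 bytes (values are hypothetical):

    // increment(0): offset() 0 -> 1 -> 2                 (along the row)
    // increment(1): dimension 1 advances by 16 and dimension 0 is reset,
    //               so offset() continues 16 -> 17 -> 18 on the next row.
    // execute_window_loop issues exactly this pattern of increments for
    // every iterator passed to it.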
+inline void Iterator::reset(const size_t dimension)
+{
+    ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions - 1);
+
+    _dims[dimension]._dim_start = _dims[dimension + 1]._dim_start;
+
+    for(unsigned int n = 0; n < dimension; ++n)
+    {
+        _dims[n]._dim_start = _dims[dimension]._dim_start;
+    }
+}
+
+inline bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, int fixed_point_position)
+{
+    if(info.tensor_shape().total_size() == 0)
+    {
+        info.set_data_type(data_type);
+        info.set_tensor_shape(shape);
+        info.set_num_channels(num_channels);
+        info.set_fixed_point_position(fixed_point_position);
+        return true;
+    }
+
+    return false;
+}
+
+inline bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape)
+{
+    if(info.tensor_shape().total_size() == 0)
+    {
+        info.set_tensor_shape(shape);
+        return true;
+    }
+
+    return false;
+}
+
+inline bool set_format_if_unknown(ITensorInfo &info, Format format)
+{
+    if(info.data_type() == DataType::UNKNOWN)
+    {
+        info.set_format(format);
+        return true;
+    }
+
+    return false;
+}
+
+inline bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type)
+{
+    if(info.data_type() == DataType::UNKNOWN)
+    {
+        info.set_data_type(data_type);
+        return true;
+    }
+
+    return false;
+}
+
+inline bool set_fixed_point_position_if_zero(ITensorInfo &info, int fixed_point_position)
+{
+    if(info.fixed_point_position() == 0 && (info.data_type() == DataType::QS8 || info.data_type() == DataType::QS16))
+    {
+        info.set_fixed_point_position(fixed_point_position);
+        return true;
+    }
+
+    return false;
+}
+} // namespace arm_compute
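A small sketch of the auto-initialization idiom these helpers enable in kernel configure() methods (tensor-info names are illustrative):

    // Give an unconfigured destination the source's metadata; no-op otherwise.
    void init_output_sketch(const ITensorInfo &src, ITensorInfo &dst)
    {
        auto_init_if_empty(dst, src.tensor_shape(), static_cast<int>(src.num_channels()),
                           src.data_type(), src.fixed_point_position());
    }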
diff --git a/arm_compute/core/IAccessWindow.h b/arm_compute/core/IAccessWindow.h
new file mode 100644
index 0000000000..cf7490d53e
--- /dev/null
+++ b/arm_compute/core/IAccessWindow.h
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IACCESS_WINDOW_H__
+#define __ARM_COMPUTE_IACCESS_WINDOW_H__
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+
+#include <array>
+
+namespace arm_compute
+{
+class Window;
+class ITensorInfo;
+
+/** Decrease @p required in steps of @p step until it's less than @p available.
+ *
+ * @param[in] required  Number of required bytes.
+ * @param[in] available Number of available bytes.
+ * @param[in] step      Step size used to decrease required bytes.
+ *
+ * @return Largest value not greater than @p available that can be reached from @p required in steps of @p step
+ *
+ **/
+inline int adjust_down(int required, int available, int step)
+{
+    ARM_COMPUTE_ERROR_ON(step <= 0);
+
+    return required - step * ((required - available + step - 1) / step);
+}
+
+/** Increase @p required in steps of @p step until it's greater than @p available.
+ *
+ * @param[in] required  Number of required bytes.
+ * @param[in] available Number of available bytes.
+ * @param[in] step      Step size used to increase required bytes.
+ *
+ * @return Smallest value not smaller than @p available that can be reached from @p required in steps of @p step
+ *
+ **/
+inline int adjust_up(int required, int available, int step)
+{
+    ARM_COMPUTE_ERROR_ON(step <= 0);
+
+    return required + step * ((available - required + step - 1) / step);
+}
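Worked examples of the two helpers above (values are illustrative):

    // adjust_down(10, 7, 4)  = 10 - 4 * ((10 - 7 + 3) / 4)  = 6   (6 <= 7)
    // adjust_up(10, 13, 4)   = 10 + 4 * ((13 - 10 + 3) / 4) = 14  (14 >= 13)
    // Both results differ from `required` by a multiple of `step`, which is
    // what the access windows below rely on when resizing padding.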
+/** Interface describing methods to update access window and padding based on kernel parameters. */
+class IAccessWindow
+{
+public:
+    virtual ~IAccessWindow() = default;
+    /** Shrink the window if padding is not large enough.
+     *
+     * @param[in] window Window used by the kernel.
+     *
+     * @return True if the window has been changed.
+     */
+    virtual bool update_window_if_needed(Window &window) const = 0;
+    /** Increase the padding to be large enough for the window.
+     *
+     * @param[in] window Window used by the kernel.
+     *
+     * @return True if the padding has been changed.
+     */
+    virtual bool update_padding_if_needed(const Window &window) const = 0;
+    /** Compute the valid region based on access pattern and valid region of the inputs.
+     *
+     * @note This method assumes that there is no border.
+     *
+     * @param[in] window             Execution window of the kernel.
+     * @param[in] input_valid_region Combined valid region of all inputs.
+     * @param[in] border_undefined   Undefined borders are excluded from the valid region.
+     * @param[in] border_size        Size of the border around the XY-plane of the tensor.
+     */
+    virtual ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const = 0;
+};
+
+/** Implementation of a rectangular access pattern. */
+class AccessWindowRectangle : public IAccessWindow
+{
+public:
+    /** Constructor for a rectangular access pattern.
+     *
+     * @note Width and height have to be non-negative.
+     *
+     * @param[in,out] info   Tensor info of the tensor accessed by the kernel.
+     * @param[in]     x      Offset of the access in X direction.
+     * @param[in]     y      Offset of the access in Y direction.
+     * @param[in]     width  Number of elements that are accessed in X direction.
+     * @param[in]     height Number of elements that are accessed in Y direction.
+     */
+    AccessWindowRectangle(ITensorInfo *info, int x, int y, int width, int height)
+        : AccessWindowRectangle(info, x, y, width, height, 1.f, 1.f)
+    {
+    }
+
+    /** Constructor for a rectangular access pattern.
+     *
+     * @note Width, height and scale have to be non-negative.
+     *
+     * @param[in,out] info    Tensor info of the tensor accessed by the kernel.
+     * @param[in]     x       Offset of the access in X direction.
+     * @param[in]     y       Offset of the access in Y direction.
+     * @param[in]     width   Number of elements that are accessed in X direction.
+     * @param[in]     height  Number of elements that are accessed in Y direction.
+     * @param[in]     scale_x Ratio along the X direction between the window used by the execute_window_loop and the rectangular access pattern defined
+     * @param[in]     scale_y Ratio along the Y direction between the window used by the execute_window_loop and the rectangular access pattern defined
+     */
+    AccessWindowRectangle(ITensorInfo *info, int x, int y, int width, int height, float scale_x, float scale_y)
+        : _info(info), _x(x), _y(y), _width(width), _height(height), _scale_x(scale_x), _scale_y(scale_y)
+    {
+        ARM_COMPUTE_ERROR_ON(width < 0);
+        ARM_COMPUTE_ERROR_ON(height < 0);
+        ARM_COMPUTE_ERROR_ON(scale_x < 0);
+        ARM_COMPUTE_ERROR_ON(scale_y < 0);
+    }
+
+    AccessWindowRectangle(const AccessWindowRectangle &) = delete;
+    AccessWindowRectangle &operator=(const AccessWindowRectangle &) = delete;
+    AccessWindowRectangle(AccessWindowRectangle &&)                 = default;
+    AccessWindowRectangle &operator=(AccessWindowRectangle &&) = default;
+    ~AccessWindowRectangle()                                    = default;
+
+    /** Set the valid region based on access pattern, valid region of the inputs and border mode.
+     *
+     * @param[in] window             Execution window of the kernel.
+     * @param[in] input_valid_region Combined valid region of all inputs.
+     * @param[in] border_undefined   (Optional) Undefined borders are excluded from the valid region.
+     * @param[in] border_size        (Optional) Size of the border around the XY-plane of the tensor.
+     */
+    void set_valid_region(const Window &window, const ValidRegion &input_valid_region, bool border_undefined = false, const BorderSize &border_size = BorderSize(0));
+
+    /** Compute the valid region based on access pattern, valid region of the inputs and border mode.
+     *
+     * @note This method assumes that there is no border.
+     *
+     * @param[in] window             Execution window of the kernel.
+     * @param[in] input_valid_region Combined valid region of all inputs.
+     */
+    ValidRegion compute_valid_region(const Window &window, const ValidRegion &input_valid_region) const;
+
+    // Inherited methods overridden:
+
+    /** @note This method assumes that all elements written by the kernel are valid. */
+    ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+
+    bool update_window_if_needed(Window &window) const override;
+    bool update_padding_if_needed(const Window &window) const override;
+
+protected:
+    ITensorInfo *_info;
+    int          _x;
+    int          _y;
+    int          _width;
+    int          _height;
+    float        _scale_x;
+    float        _scale_y;
+};
+
+/** Implementation of a column access pattern. */
+class AccessWindowVertical : public AccessWindowRectangle
+{
+public:
+    /** Constructor for a column access pattern.
+     *
+     * @note Height has to be non-negative.
+     *
+     * @param[in,out] info    Tensor info of the tensor accessed by the kernel.
+     * @param[in]     y       Offset of the access in Y direction.
+     * @param[in]     height  Number of elements that are accessed in Y direction.
+     * @param[in]     scale_y Ratio along the Y direction between the window used by the execute_window_loop and the rectangular access pattern defined
+     */
+    AccessWindowVertical(ITensorInfo *info, int y, int height, float scale_y = 1.f)
+        : AccessWindowRectangle(info, 0, y, 1, height, 1.f, scale_y)
+    {
+        ARM_COMPUTE_ERROR_ON(height < 0);
+        ARM_COMPUTE_ERROR_ON(scale_y < 0);
+    }
+};
+
+/** Implementation of a row access pattern. */
+class AccessWindowHorizontal : public AccessWindowRectangle
+{
+public:
+    /** Constructor for a row access pattern.
+     *
+     * @note Width has to be non-negative.
+     *
+     * @param[in,out] info    Tensor info of the tensor accessed by the kernel.
+     * @param[in]     x       Offset of the access in X direction.
+     * @param[in]     width   Number of elements that are accessed in X direction.
+     * @param[in]     scale_x Ratio along the X direction between the window used by the execute_window_loop and the rectangular access pattern defined
+     */
+    AccessWindowHorizontal(ITensorInfo *info, int x, int width, float scale_x = 1.f)
+        : AccessWindowRectangle(info, x, 0, width, 1, scale_x, 1.f)
+    {
+        ARM_COMPUTE_ERROR_ON(width < 0);
+        ARM_COMPUTE_ERROR_ON(scale_x < 0);
+    }
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_IACCESS_WINDOW_H__*/
diff --git a/arm_compute/core/IArray.h b/arm_compute/core/IArray.h
new file mode 100644
index 0000000000..2ed56100cf
--- /dev/null
+++ b/arm_compute/core/IArray.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IARRAY_H__
+#define __ARM_COMPUTE_IARRAY_H__
+
+#include "arm_compute/core/Error.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+class KeyPoint;
+class Coordinates2D;
+class DetectionWindow;
+class Size2D;
+
+/** Array of type T */
+template <class T>
+class IArray
+{
+public:
+    /** Default constructor */
+    IArray()
+        : _num_values(0), _max_size(0) {};
+    /** Constructor: initializes an array which can contain up to max_num_values values
+     *
+     * @param[in] max_num_values Maximum number of values the array will be able to store
+     */
+    IArray(size_t max_num_values)
+        : _num_values(0), _max_size(max_num_values)
+    {
+    }
+    /** Maximum number of values which can be stored in this array
+     *
+     * @return Maximum number of values
+     */
+    size_t max_num_values() const
+    {
+        return _max_size;
+    }
+    /** Default virtual destructor */
+    virtual ~IArray() = default;
+    /** Number of values currently stored in the array
+     *
+     * @return Number of values currently stored in the array or max_num_values + 1 if the array is overflowed.
+     */
+    size_t num_values() const
+    {
+        return _num_values;
+    }
+    /** Append the passed argument to the end of the array if there is room.
+     *
+     * @param[in] val Value to add to the array.
+     *
+     * @return True if the point was successfully added to the array. False if the array is full and the point couldn't be added.
+     */
+    bool push_back(const T &val)
+    {
+        ARM_COMPUTE_ERROR_ON(0 == _max_size);
+        if(_num_values >= max_num_values())
+        {
+            _num_values = max_num_values() + 1;
+            return false;
+        }
+        at(_num_values) = val;
+        _num_values++;
+        return true;
+    }
+    /** Clear all the points from the array. */
+    void clear()
+    {
+        _num_values = 0;
+    }
+    /** Did we lose some values because the array is too small?
+     *
+     * @return True if we tried to add a value using push_back() but there wasn't any room left to store it.
+     *         False if all the values were successfully added to the array.
+     */
+    bool overflow() const
+    {
+        return _num_values > max_num_values();
+    }
+    /** Pointer to the first element of the array
+     *
+     * Other elements of the array can be accessed using buffer()[idx] for 0 <= idx < num_values().
+     *
+     * @return A pointer to the first element of the array
+     */
+    virtual T *buffer() const = 0;
+    /** Reference to the element of the array located at the given index
+     *
+     * @param[in] index Index of the element
+     *
+     * @return A reference to the element of the array located at the given index.
+     */
+    virtual T &at(size_t index) const
+    {
+        ARM_COMPUTE_ERROR_ON(buffer() == nullptr);
+        ARM_COMPUTE_ERROR_ON(index >= max_num_values());
+        return buffer()[index];
+    }
+    /** Resizes the array to contain "num" elements. If "num" is smaller than the current size, the content is reduced to its first "num" elements.
+     * "num" can't be bigger than the maximum number of values which can be stored in this array.
+     *
+     * @param[in] num The new array size in number of elements
+     */
+    void resize(size_t num)
+    {
+        ARM_COMPUTE_ERROR_ON(num > max_num_values());
+        _num_values = num;
+    };
+
+private:
+    size_t _num_values;
+    size_t _max_size;
+};
+using IKeyPointArray        = IArray<KeyPoint>;
+using ICoordinates2DArray   = IArray<Coordinates2D>;
+using IDetectionWindowArray = IArray<DetectionWindow>;
+using ISize2DArray          = IArray<Size2D>;
+using IUInt8Array           = IArray<uint8_t>;
+using IUInt16Array          = IArray<uint16_t>;
+using IUInt32Array          = IArray<uint32_t>;
+using IInt16Array           = IArray<int16_t>;
+using IInt32Array           = IArray<int32_t>;
+using IFloatArray           = IArray<float>;
+}
+#endif /* __ARM_COMPUTE_IARRAY_H__ */
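A usage sketch for the container interface above (assumes a concrete implementation such as an Array<KeyPoint> with fixed capacity, and that Types.h is included for KeyPoint):

    void collect_corner(IKeyPointArray &corners, const KeyPoint &kp)
    {
        if(!corners.push_back(kp))
        {
            // The array never grows: a failed push_back sets the overflow flag.
            ARM_COMPUTE_ERROR_ON(!corners.overflow());
        }
    }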
diff --git a/arm_compute/core/IDistribution.h b/arm_compute/core/IDistribution.h
new file mode 100644
index 0000000000..b57543a3bf
--- /dev/null
+++ b/arm_compute/core/IDistribution.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IDISTRIBUTION_H__
+#define __ARM_COMPUTE_IDISTRIBUTION_H__
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+/** Interface for distribution objects */
+class IDistribution
+{
+public:
+    /** Default virtual destructor */
+    virtual ~IDistribution() = default;
+    /** Returns the dimensions of the distribution.
+     *
+     * @note This is fixed to 1-dimensional distribution for now.
+     * @return Dimensions of the distribution.
+     */
+    virtual size_t dimensions() const = 0;
+    /** Returns the total size in bytes of the distribution.
+     *
+     * @return Total size of the distribution in bytes.
+     */
+    virtual size_t size() const = 0;
+    /** Returns a pointer to the start of the distribution.
+     * Other elements of the array can be accessed using buffer()[idx] for 0 <= idx < num_bins()
+     *
+     * @return Pointer to the start of the distribution.
+     */
+    virtual uint32_t *buffer() const = 0;
+    /** Clears the distribution by setting every element to zero. */
+    void clear() const;
+};
+}
+#endif /* __ARM_COMPUTE_IDISTRIBUTION_H__ */
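A short sketch of how the interface above is typically consumed (illustrative; any concrete IDistribution, e.g. a 1D histogram distribution, would do):

    void reset_and_read(IDistribution &dist)
    {
        dist.clear();                                            // zero all bins
        const size_t num_bins = dist.size() / sizeof(uint32_t);  // 1D for now
        uint32_t    *bins     = dist.buffer();
        ARM_COMPUTE_ERROR_ON(num_bins != 0 && bins == nullptr);
    }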
diff --git a/arm_compute/core/IDistribution1D.h b/arm_compute/core/IDistribution1D.h
new file mode 100644
index 0000000000..ca8bfc0a7d
--- /dev/null
+++ b/arm_compute/core/IDistribution1D.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IDISTRIBUTION1D_H__
+#define __ARM_COMPUTE_IDISTRIBUTION1D_H__
+
+#include "arm_compute/core/IDistribution.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+/** 1D Distribution interface */
+class IDistribution1D : public IDistribution
+{
+public:
+    /** Constructor: Creates a 1D Distribution of a consecutive interval [offset, offset + range - 1]
+     * defined by a start offset and valid range, divided equally into num_bins parts.
+     *
+     * @param[in] num_bins The number of bins the distribution is divided in.
+     * @param[in] offset   The start of the values to use.
+     * @param[in] range    The total number of the consecutive values of the distribution interval.
+     */
+    IDistribution1D(size_t num_bins, int32_t offset, uint32_t range);
+    /** Returns the number of bins that the distribution has.
+     *
+     * @return Number of bins of the distribution.
+     */
+    size_t num_bins() const;
+    /** Returns the offset of the distribution.
+     *
+     * @return Offset of the distribution.
+     */
+    int32_t offset() const;
+    /** Returns the range of the distribution.
+     *
+     * @return Range of the distribution.
+     */
+    uint32_t range() const;
+    /** Returns the window of the distribution, which is the range divided by the number of bins.
+     *
+     * @note If the number of bins does not divide the range evenly, the window is invalid.
+     *
+     * @return Window of the distribution.
+     */
+    uint32_t window() const;
+    /** Sets the range of the distribution.
+     *
+     * @param[in] range New range of the distribution to be set.
+     */
+    void set_range(uint32_t range);
+
+    // Inherited methods overridden:
+    size_t size() const override;
+    size_t dimensions() const override;
+
+private:
+    size_t   _num_bins; /**< Number of bins. */
+    int32_t  _offset;   /**< Offset, which indicates the start of the usable values. */
+    uint32_t _range;    /**< The total number of consecutive values of the distribution interval */
+};
+}
+#endif /* __ARM_COMPUTE_IDISTRIBUTION1D_H__ */
diff --git a/arm_compute/core/IHOG.h b/arm_compute/core/IHOG.h
new file mode 100644
index 0000000000..8bf713ae82
--- /dev/null
+++ b/arm_compute/core/IHOG.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IHOG_H__
+#define __ARM_COMPUTE_IHOG_H__
+
+#include "arm_compute/core/Types.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+class HOGInfo;
+/** Interface for HOG data-object */
+class IHOG
+{
+public:
+    /** Interface to be implemented by the child class to return the HOG's metadata
+     *
+     * @return A pointer to the HOG's metadata.
+     */
+    virtual const HOGInfo *info() const = 0;
+    /** Default virtual destructor */
+    virtual ~IHOG() = default;
+    /** Pointer to the first element of the array which stores the linear SVM coefficients of HOG descriptor
+     *
+     * @note Other elements of the array can be accessed using descriptor()[idx] for idx=[0, descriptor_size() - 1]
+     *
+     * @return A pointer to the first element of the array which stores the linear SVM coefficients of HOG descriptor
+     */
+    virtual float *descriptor() const = 0;
+};
+}
+#endif /* __ARM_COMPUTE_IHOG_H__ */
diff --git a/arm_compute/core/IKernel.h b/arm_compute/core/IKernel.h
new file mode 100644
index 0000000000..4f3812b6da
--- /dev/null
+++ b/arm_compute/core/IKernel.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IKERNEL_H__
+#define __ARM_COMPUTE_IKERNEL_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+/** Common information for all the kernels */
+class IKernel
+{
+public:
+    /** Constructor */
+    IKernel();
+    /** Destructor */
+    virtual ~IKernel() = default;
+    /** Indicates whether or not the kernel is parallelisable
+     *
+     * If the kernel is parallelisable then the window returned by window() can be split into sub-windows
+     * which can then be run in parallel.
+     *
+     * If the kernel is not parallelisable then only the window returned by window() can be passed to run()
+     *
+     * @return True if the kernel is parallelisable
+     */
+    virtual bool is_parallelisable() const;
+    /** The size of the border for that kernel
+     *
+     * @return The width in number of elements of the border.
+     */
+    virtual BorderSize border_size() const;
+    /** The maximum window the kernel can be executed on
+     *
+     * @return The maximum window the kernel can be executed on.
+     */
+    const Window &window() const;
+
+protected:
+    /** Configure the kernel's window
+     *
+     * @param[in] window The maximum window which will be returned by window()
+     */
+    void configure(const Window &window);
+
+private:
+    Window _window;
+};
+}
+#endif /*__ARM_COMPUTE_IKERNEL_H__ */
diff --git a/arm_compute/core/ILut.h b/arm_compute/core/ILut.h
new file mode 100644
index 0000000000..5223aea67a
--- /dev/null
+++ b/arm_compute/core/ILut.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ILUT_H__
+#define __ARM_COMPUTE_ILUT_H__
+
+#include "arm_compute/core/Types.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+/** Lookup Table object interface. */
+class ILut
+{
+public:
+    /** Default virtual destructor */
+    virtual ~ILut() = default;
+    /** Returns the total number of elements in the LUT.
+     *
+     * @return Total number of elements.
+     */
+    virtual size_t num_elements() const = 0;
+    /** Indicates the offset that needs to be applied to the raw index before performing a lookup in the LUT.
+     *
+     * @return The normalization offset.
+     */
+    virtual uint32_t index_offset() const = 0;
+    /** Returns the total size in bytes of the LUT.
+     *
+     * @return Total size of the LUT in bytes.
+     */
+    virtual size_t size_in_bytes() const = 0;
+    /** Returns the type of the LUT.
+     *
+     * @return The type of the LUT.
+     */
+    virtual DataType type() const = 0;
+    /** Returns a pointer to the start of the LUT.
+     * Other elements of the LUT can be accessed using buffer()[idx] for 0 <= idx < num_elements().
+     *
+     * @return Pointer to the start of the LUT.
+     */
+    virtual uint8_t *buffer() const = 0;
+    /** Clears the LUT by setting every element to zero. */
+    virtual void clear() = 0;
+};
+}
+#endif /* __ARM_COMPUTE_ILUT_H__ */
diff --git a/arm_compute/core/IMultiHOG.h b/arm_compute/core/IMultiHOG.h
new file mode 100644
index 0000000000..e91da75398
--- /dev/null
+++ b/arm_compute/core/IMultiHOG.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IMULTIHOG_H__
+#define __ARM_COMPUTE_IMULTIHOG_H__
+
+#include "arm_compute/core/IHOG.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+/** Interface for storing multiple HOG data-objects */
+class IMultiHOG
+{
+public:
+    /** Default destructor */
+    virtual ~IMultiHOG() = default;
+    /** The number of HOG models stored
+     *
+     * @return The number of HOG models stored
+     */
+    virtual size_t num_models() const = 0;
+    /** Return a pointer to the requested HOG model
+     *
+     * @param[in] index The index of the wanted HOG model.
+     *
+     * @return A pointer to the requested HOG model
+     */
+    virtual IHOG *model(size_t index) = 0;
+    /** Return a const pointer to the requested HOG model
+     *
+     * @param[in] index The index of the wanted HOG model.
+     *
+     * @return A const pointer to the requested HOG model
+     */
+    virtual const IHOG *model(size_t index) const = 0;
+};
+}
+
+#endif /* __ARM_COMPUTE_IMULTIHOG_H__ */
diff --git a/arm_compute/core/IMultiImage.h b/arm_compute/core/IMultiImage.h
new file mode 100644
index 0000000000..6ed3c785ca
--- /dev/null
+++ b/arm_compute/core/IMultiImage.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IMULTIIMAGE_H__
+#define __ARM_COMPUTE_IMULTIIMAGE_H__
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+class MultiImageInfo;
+
+/** Interface for multi-planar images */
+class IMultiImage
+{
+public:
+    /** Destructor */
+    virtual ~IMultiImage() = default;
+    /** Interface to be implemented by the child class to return the multi-planar image's metadata
+     *
+     * @return A pointer to the image's metadata.
+     */
+    virtual const MultiImageInfo *info() const = 0;
+    /** Return a pointer to the requested plane of the image.
+     *
+     * @param[in] index The index of the wanted plane.
+     *
+     * @return A pointer to the requested plane
+     */
+    virtual IImage *plane(unsigned int index) = 0;
+    /** Return a constant pointer to the requested plane of the image.
+     *
+     * @param[in] index The index of the wanted plane.
+     *
+     * @return A constant pointer to the requested plane
+     */
+    virtual const IImage *plane(unsigned int index) const = 0;
+};
+}
+#endif /*__ARM_COMPUTE_IMULTIIMAGE_H__ */
diff --git a/arm_compute/core/IPyramid.h b/arm_compute/core/IPyramid.h
new file mode 100644
index 0000000000..e5d7011cf9
--- /dev/null
+++ b/arm_compute/core/IPyramid.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IPYRAMID_H__
+#define __ARM_COMPUTE_IPYRAMID_H__
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/PyramidInfo.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+/** Interface for pyramid data-object */
+class IPyramid
+{
+public:
+    /** Default virtual destructor */
+    virtual ~IPyramid() = default;
+    /** Interface to be implemented by the child class to return the Pyramid's metadata
+     *
+     * @return A pointer to the Pyramid's metadata.
+     */
+    virtual const PyramidInfo *info() const = 0;
+    /** Retrieves a level of the pyramid as an ITensor pointer
+     *
+     * @param[in] index The index of the level, such that index is less than levels.
+     *
+     * @return An ITensor pointer
+     */
+    virtual ITensor *get_pyramid_level(size_t index) const = 0;
+};
+}
+
+#endif /* __ARM_COMPUTE_IPYRAMID_H__ */
diff --git a/arm_compute/core/ITensor.h b/arm_compute/core/ITensor.h
new file mode 100644
index 0000000000..202b50a0d8
--- /dev/null
+++ b/arm_compute/core/ITensor.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ITENSOR_H__
+#define __ARM_COMPUTE_ITENSOR_H__
+
+#include "arm_compute/core/TensorInfo.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class Coordinates;
+
+/** Interface for NEON tensor */
+class ITensor
+{
+public:
+    /** Interface to be implemented by the child class to return the tensor's metadata
+     *
+     * @return A pointer to the tensor's metadata.
+     */
+    virtual ITensorInfo *info() const = 0;
+    /** Interface to be implemented by the child class to return the tensor's metadata
+     *
+     * @return A pointer to the tensor's metadata.
+     */
+    virtual ITensorInfo *info() = 0;
+    /** Default virtual destructor */
+    virtual ~ITensor() = default;
+    /** Interface to be implemented by the child class to return a pointer to CPU memory
+     *
+     * @return A CPU pointer to the beginning of the image's allocation.
+     */
+    virtual uint8_t *buffer() const = 0;
+
+    /** Return a pointer to the element at the passed coordinates
+     *
+     * @param[in] id Coordinates of the element
+     *
+     * @return Pointer to the requested element
+     */
+    inline uint8_t *ptr_to_element(const Coordinates &id) const
+    {
+        return buffer() + info()->offset_element_in_bytes(id);
+    }
+
+    /** Copy the content of another tensor.
+     *
+     * @note The number of dimensions of the source tensor must be less than or equal to those of the destination tensor.
+     *
+     * @note All dimensions of the destination tensor must be greater than or equal to the source tensor ones.
+     *
+     * @note num_channels() and element_size() of both tensors must match.
+     *
+     * @param[in] src Source tensor to copy from.
+     */
+    void copy_from(const ITensor &src);
+
+    /** Print a tensor to a given stream using user defined formatting information
+     *
+     * @param s      Output stream
+     * @param io_fmt Format information
+     */
+    void print(std::ostream &s, IOFormatInfo io_fmt = IOFormatInfo()) const;
+};
+
+using IImage = ITensor;
+}
+#endif /*__ARM_COMPUTE_ITENSOR_H__ */
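A sketch of direct element access through the interface above (illustrative; assumes an allocated single-channel U8 tensor):

    uint8_t read_pixel_sketch(const ITensor &t, int x, int y)
    {
        return *t.ptr_to_element(Coordinates(x, y));
    }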
diff --git a/arm_compute/core/ITensorInfo.h b/arm_compute/core/ITensorInfo.h
new file mode 100644
index 0000000000..bb3ac6e35e
--- /dev/null
+++ b/arm_compute/core/ITensorInfo.h
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ITENSORINFO_H__
+#define __ARM_COMPUTE_ITENSORINFO_H__
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Strides.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+/** Store the tensor's metadata */
+class ITensorInfo
+{
+public:
+    /** Default virtual destructor */
+    virtual ~ITensorInfo() = default;
+    /** Set the data type to the specified value.
+     *
+     * @warning This resets the format to UNKNOWN.
+     *
+     * @param[in] data_type The new data type.
+     */
+    virtual void set_data_type(DataType data_type) = 0;
+    /** Set the number of channels to the specified value.
+     *
+     * @warning This resets the format to UNKNOWN.
+     *
+     * @param[in] num_channels New number of channels.
+     */
+    virtual void set_num_channels(int num_channels) = 0;
+    /** Set the format of an already initialized tensor.
+     *
+     * @note If the data type has already been configured (i.e. not UNKNOWN) it
+     * must match the new format. If the data type hasn't been configured it will
+     * be based on the format.
+     *
+     * @param[in] format Single-plane format of the tensor.
+     */
+    virtual void set_format(Format format) = 0;
+    /** Set the shape of an already initialized tensor.
+     *
+     * @warning Changing the shape requires recomputing the strides and is
+     * therefore only possible if the tensor hasn't been allocated yet.
+     *
+     * @param[in] shape New tensor shape.
+     */
+    virtual void set_tensor_shape(TensorShape shape) = 0;
+    /** Set the fixed point position to the specified value
+     *
+     * @warning The fixed point position must be set once the data type has been configured
+     *
+     * @param[in] fixed_point_position The new fixed point position
+     */
+    virtual void set_fixed_point_position(int fixed_point_position) = 0;
+    /** Update the offset to the first element and the strides to automatically computed values.
+     *
+     * @note The padding used by this method is really conservative so that the tensor can be used for most functions.
+     *
+     * @return True if the strides or the offset to the first element have changed.
+     */
+    virtual bool auto_padding() = 0;
+    /** Update the offset to the first element, the strides and the total size.
+     *
+     * @note This function can only increase the offset, strides and total size.
+     *
+     * @param[in] padding Padding around the XY plane in number of elements.
+     *
+     * @return True if the strides, offset and total size have changed.
+     */
+    virtual bool extend_padding(const PaddingSize &padding) = 0;
+    /** Return the size of the requested dimension
+     *
+     * @param[in] index Index of the dimension
+     *
+     * @return Dimension of the requested dimension
+     */
+    virtual size_t dimension(size_t index) const = 0;
+    /** The strides in bytes for accessing each dimension of the tensor
+     *
+     * @return Strides in bytes for each tensor dimension
+     */
+    virtual const Strides &strides_in_bytes() const = 0;
+    /** The offset from the beginning of the memory allocation to the first element of the tensor.
+     * This can be used to access efficiently elements in a 2D tensor
+     *
+     * @return The offset in bytes to access the first element of the tensor.
+     */
+    virtual size_t offset_first_element_in_bytes() const = 0;
+    /** The offset in bytes from the beginning of the memory allocation to access the element at position (x, y, z ...)
+     *
+     * @param[in] pos Vector with the coordinates of the element to access.
+     *                The size of this vector must be equal to the number of dimensions of the tensor
+     *
+     * @return Offset in bytes from the beginning of the memory allocation to access the element (x, y, z, ...)
+     */
+    virtual size_t offset_element_in_bytes(const Coordinates &pos) const = 0;
+    /** Fixed point position used when the tensor data type is QS8 or QS16
+     *
+     * @return The fixed point position that expresses the number of bits for the fractional part of the number
+     */
+    virtual int fixed_point_position() const = 0;
+    /** Element size in bytes calculated as data_size() * num_channels()
+     *
+     * @return The size of one element in bytes
+     */
+    virtual size_t element_size() const = 0;
+    /** The number of dimensions of the tensor (rank)
+     *
+     * @return The number of dimensions of the tensor (rank)
+     */
+    virtual size_t num_dimensions() const = 0;
+    /** The number of channels for each tensor element
+     *
+     * @return The number of channels for each tensor element
+     */
+    virtual size_t num_channels() const = 0;
+    /** Size for each dimension of the tensor
+     *
+     * @return A vector with the size for each dimension of the tensor
+     */
+    virtual const TensorShape &tensor_shape() const = 0;
+    /** Data type used for each element of the tensor
+     *
+     * @return Tensor data type
+     */
+    virtual DataType data_type() const = 0;
+    /** Colour format of the image
+     *
+     * @return Colour format of the image
+     */
+    virtual Format format() const = 0;
+    /** Returns the total size of the tensor in bytes.
+     *
+     * @return Total size of the tensor in bytes.
+     */
+    virtual size_t total_size() const = 0;
+    /** Padding of tensor.
+     *
+     * @return Padding.
+     */
+    virtual PaddingSize padding() const = 0;
+    /** Checks if the tensor has been allocated with padding or not.
+     *
+     * @return True if padding is allocated in the tensor, otherwise false.
+     */
+    virtual bool has_padding() const = 0;
+    /** Flag indicating whether the size of the tensor can be changed.
+     *
+     * @return True if the tensor size can be changed.
+     */
+    virtual bool is_resizable() const = 0;
+    /** Set the flag whether the tensor size can be changed.
+     *
+     * @param[in] is_resizable Flag that marks the tensor if it can be changed or not.
+     */
+    virtual void set_is_resizable(bool is_resizable) = 0;
+    /** Valid region of the tensor. All elements in the valid region have defined values, i.e. are not undefined.
+     *
+     * @return The valid region.
+     */
+    virtual ValidRegion valid_region() const = 0;
+    /** Set the valid region of the tensor.
+     *
+     * @param[in] valid_region Valid region to set.
+     */
+    virtual void set_valid_region(ValidRegion valid_region) = 0;
+};
+}
+#endif /*__ARM_COMPUTE_ITENSORINFO_H__ */
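Editorial aside (not part of the patch): the accessors above fully determine the address of an element. For the padded row-major layout used here, the offset returned by offset_element_in_bytes() is

    offset(pos) = offset_first_element_in_bytes() + sum_i pos[i] * strides_in_bytes()[i]

so for a 2D FP32 tensor, element (x, y) lives x * 4 + y * stride_y bytes past the first element, where stride_y >= width * 4 once padding is included.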
diff --git a/arm_compute/core/MultiImageInfo.h b/arm_compute/core/MultiImageInfo.h
new file mode 100644
index 0000000000..6d76953845
--- /dev/null
+++ b/arm_compute/core/MultiImageInfo.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_MULTIIMAGEINFO_H__
+#define __ARM_COMPUTE_MULTIIMAGEINFO_H__
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+/** Store the multi-planar image's metadata */
+class MultiImageInfo
+{
+public:
+    /** Constructor */
+    MultiImageInfo();
+    /** Initialize the metadata structure with the given parameters
+     *
+     * @param[in] width  Width of the image (in number of pixels)
+     * @param[in] height Height of the image (in number of pixels)
+     * @param[in] format Colour format of the image.
+     */
+    void init(unsigned int width, unsigned int height, Format format);
+    /** Colour format of the image
+     *
+     * @return Colour format of the image
+     */
+    Format format() const;
+    /** Width in pixels
+     *
+     * @return The width in pixels
+     */
+    unsigned int width() const;
+    /** Height in pixels
+     *
+     * @return The height in pixels
+     */
+    unsigned int height() const;
+
+protected:
+    unsigned int _width;
+    unsigned int _height;
+    Format       _format;
+};
+}
+#endif /*__ARM_COMPUTE_MULTIIMAGEINFO_H__ */
diff --git a/arm_compute/core/NEON/INEKernel.h b/arm_compute/core/NEON/INEKernel.h
new file mode 100644
index 0000000000..3ac8164a51
--- /dev/null
+++ b/arm_compute/core/NEON/INEKernel.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_INEKERNEL_H__
+#define __ARM_COMPUTE_INEKERNEL_H__
+
+#include "arm_compute/core/CPP/ICPPKernel.h"
+
+namespace arm_compute
+{
+using INEKernel = ICPPKernel;
+}
+#endif /*__ARM_COMPUTE_INEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/INESimpleKernel.h b/arm_compute/core/NEON/INESimpleKernel.h
new file mode 100644
index 0000000000..ca25532ef1
--- /dev/null
+++ b/arm_compute/core/NEON/INESimpleKernel.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_INESIMPLEKERNEL_H__
+#define __ARM_COMPUTE_INESIMPLEKERNEL_H__
+
+#include "arm_compute/core/CPP/ICPPSimpleKernel.h"
+
+namespace arm_compute
+{
+using INESimpleKernel = ICPPSimpleKernel;
+}
+#endif /*__ARM_COMPUTE_INESIMPLEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/NEColorConvertHelper.inl b/arm_compute/core/NEON/NEColorConvertHelper.inl
new file mode 100644
index 0000000000..9be7c8a658
--- /dev/null
+++ b/arm_compute/core/NEON/NEColorConvertHelper.inl
@@ -0,0 +1,888 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IMultiImage.h"
+#include "arm_compute/core/Utils.h"
+
+#include <arm_neon.h>
+
+namespace
+{
+constexpr float red_coef_bt709    = 1.5748f;
+constexpr float green_coef_bt709  = -0.1873f;
+constexpr float green_coef2_bt709 = -0.4681f;
+constexpr float blue_coef_bt709   = 1.8556f;
+
+constexpr float rgb2yuv_bt709_kr = 0.2126f;
+constexpr float rgb2yuv_bt709_kb = 0.0722f;
+// K_g = 1 - K_r - K_b
+constexpr float rgb2yuv_bt709_kg = 0.7152f;
+// C_u = 1 / (2 * (1 - K_b))
+constexpr float rgb2yuv_bt709_cu = 0.5389f;
+// C_v = 1 / (2 * (1 - K_r))
+constexpr float rgb2yuv_bt709_cv = 0.6350f;
+
+inline void convert_uint8x16_to_float32x4x4(const uint8x16_t &in, float32x4x4_t &out)
+{
+    const auto tmp1 = vmovl_u8(vget_low_u8(in));
+    out.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp1)));
+    out.val[1] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp1)));
+    const auto tmp2 = vmovl_u8(vget_high_u8(in));
+    out.val[2] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp2)));
+    out.val[3] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp2)));
+}
+
+inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out)
+{
+    out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])),
+                                         vqmovn_u32(vcvtq_u32_f32(in2.val[0]))));
+    out.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[1])),
+                                         vqmovn_u32(vcvtq_u32_f32(in2.val[1]))));
+    out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])),
+                                         vqmovn_u32(vcvtq_u32_f32(in2.val[2]))));
+}
+
+inline void convert_float32x4x4_to_unit8x16(const float32x4x4_t &in, uint8x16_t &out)
+{
+    const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])),
+                                  vqmovn_u32(vcvtq_u32_f32(in.val[1])));
+    const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])),
+                                   vqmovn_u32(vcvtq_u32_f32(in.val[3])));
+    out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
+}
+
+inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &gvec, const float32x4_t &bvec,
+                                   float32x4_t &yvec, float32x4_t &uvec, float32x4_t &vvec)
+{
+    /*
+    Y'= 0.2126*R' + 0.7152*G' + 0.0722*B'
+    U'=-0.1146*R' - 0.3854*G' + 0.5000*B'
+    V'= 0.5000*R' - 0.4542*G' - 0.0458*B'
+    */
+    const auto c128 = vdupq_n_f32(128.f);
+
+    // Y = R * K_r + G * (1 - K_r - K_b) + B * K_b
+    yvec = vmulq_n_f32(rvec, rgb2yuv_bt709_kr);
+    yvec = vmlaq_n_f32(yvec, gvec, rgb2yuv_bt709_kg);
+    yvec = vmlaq_n_f32(yvec, bvec, rgb2yuv_bt709_kb);
+
+    // U = (B - Y) / (2 * (1 - K_b))
+    uvec = vsubq_f32(bvec, yvec);
+    uvec = vmlaq_n_f32(c128, uvec, rgb2yuv_bt709_cu);
+
+    // V = (R - Y) / (2 * (1 - K_r))
+    vvec = vsubq_f32(rvec, yvec);
+    vvec = vmlaq_n_f32(c128, vvec, rgb2yuv_bt709_cv);
+}
+
+inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uvec_val, const float32x4_t &yyvec_val,
+                                    float32x4_t vvec_val, unsigned char *output_ptr, const bool alpha)
+{
+    float32x4x3_t rgb1, rgb2;
+
+    // Compute: cb - 128 and cr - 128;
+    const auto c128 = vdupq_n_f32(128.f);
+    uvec_val = vsubq_f32(uvec_val, c128);
+    vvec_val = vsubq_f32(vvec_val, c128);
+
+    // Compute:
+    // r = 0.0000f*f_u + 1.5748f*f_v;
+    // g = -0.1873f*f_u - 0.4681f*f_v;
+    // b = 1.8556f*f_u + 0.0000f*f_v;
+    const auto red  = vmulq_n_f32(vvec_val, red_coef_bt709);
+    const auto blue = vmulq_n_f32(uvec_val, blue_coef_bt709);
+    const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709),
+                                 vmulq_n_f32(vvec_val, green_coef2_bt709));
+    // Compute the final r,g,b values using y1 for the first texel and y2 for the second one.
+    // The result is stored in two float32x4x3_t which are then converted to one uint8x8x3_t
+    // and written back to memory using the vst3 instruction
+
+    rgb1.val[0] = vaddq_f32(yvec_val, red);
+    rgb1.val[1] = vaddq_f32(yvec_val, green);
+    rgb1.val[2] = vaddq_f32(yvec_val, blue);
+
+    rgb2.val[0] = vaddq_f32(yyvec_val, red);
+    rgb2.val[1] = vaddq_f32(yyvec_val, green);
+    rgb2.val[2] = vaddq_f32(yyvec_val, blue);
+
+    uint8x8x3_t u8_rgb;
+    convert_float32x4x3_to_uint8x8x3(rgb1, rgb2, u8_rgb);
+
+    if(!alpha)
+    {
+        vst3_lane_u8(&output_ptr[0], u8_rgb, 0);
+        vst3_lane_u8(&output_ptr[3], u8_rgb, 4);
+        vst3_lane_u8(&output_ptr[6], u8_rgb, 1);
+        vst3_lane_u8(&output_ptr[9], u8_rgb, 5);
+        vst3_lane_u8(&output_ptr[12], u8_rgb, 2);
+        vst3_lane_u8(&output_ptr[15], u8_rgb, 6);
+        vst3_lane_u8(&output_ptr[18], u8_rgb, 3);
+        vst3_lane_u8(&output_ptr[21], u8_rgb, 7);
+    }
+    else
+    {
+        uint8x8x4_t u8_rgba;
+        u8_rgba.val[0] = u8_rgb.val[0];
+        u8_rgba.val[1] = u8_rgb.val[1];
+        u8_rgba.val[2] = u8_rgb.val[2];
+        u8_rgba.val[3] = vdup_n_u8(255);
+        vst4_lane_u8(&output_ptr[0], u8_rgba, 0);
+        vst4_lane_u8(&output_ptr[4], u8_rgba, 4);
+        vst4_lane_u8(&output_ptr[8], u8_rgba, 1);
+        vst4_lane_u8(&output_ptr[12], u8_rgba, 5);
+        vst4_lane_u8(&output_ptr[16], u8_rgba, 2);
+        vst4_lane_u8(&output_ptr[20], u8_rgba, 6);
+        vst4_lane_u8(&output_ptr[24], u8_rgba, 3);
+        vst4_lane_u8(&output_ptr[28], u8_rgba, 7);
+    }
+}
+
+inline uint8x16x3_t load_rgb(const unsigned char *const ptr, const bool alpha)
+{
+    uint8x16x3_t rgb;
+
+    if(alpha)
+    {
+        const auto tmp = vld4q_u8(ptr);
+        rgb.val[0] = tmp.val[0];
+        rgb.val[1] = tmp.val[1];
+        rgb.val[2] = tmp.val[2];
+    }
+    else
+    {
+        rgb = vld3q_u8(ptr);
+    }
+
+    return rgb;
+}
+
+inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_bottom)
+{
+    // Convert the uint8x16_t to float32x4x4_t
+    float32x4x4_t frvec_top, fgvec_top, fbvec_top;
+    convert_uint8x16_to_float32x4x4(vec_top.val[0], frvec_top);
+    convert_uint8x16_to_float32x4x4(vec_top.val[1], fgvec_top);
+    convert_uint8x16_to_float32x4x4(vec_top.val[2], fbvec_top);
+
+    float32x4x4_t frvec_bottom, fgvec_bottom, fbvec_bottom;
+    convert_uint8x16_to_float32x4x4(vec_bottom.val[0], frvec_bottom);
+    convert_uint8x16_to_float32x4x4(vec_bottom.val[1], fgvec_bottom);
+    convert_uint8x16_to_float32x4x4(vec_bottom.val[2], fbvec_bottom);
+
+    float32x4x4_t fyvec_top, fuvec_top, fvvec_top;
+    float32x4x4_t fyvec_bottom, fuvec_bottom, fvvec_bottom;
+
+    for(auto i = 0; i < 4; ++i)
+    {
+        rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i],
+                               fyvec_top.val[i], fuvec_top.val[i], fvvec_top.val[i]);
+        rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i],
+                               fyvec_bottom.val[i], fuvec_bottom.val[i], fvvec_bottom.val[i]);
+    }
+
+    convert_float32x4x4_to_unit8x16(fyvec_top, vec_top.val[0]);
+    convert_float32x4x4_to_unit8x16(fuvec_top, vec_top.val[1]);
+    convert_float32x4x4_to_unit8x16(fvvec_top, vec_top.val[2]);
+    convert_float32x4x4_to_unit8x16(fyvec_bottom, vec_bottom.val[0]);
+    convert_float32x4x4_to_unit8x16(fuvec_bottom, vec_bottom.val[1]);
+    convert_float32x4x4_to_unit8x16(fvvec_bottom, vec_bottom.val[2]);
+}
+
+inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
+                              const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
+                              unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
+                              unsigned char *const __restrict out_uv)
+{
+    uint8x16x3_t vec_top, vec_bottom;
+    vec_top.val[0] = rvec_top;
+    vec_top.val[1] = gvec_top;
+    vec_top.val[2] = bvec_top;
+    vec_bottom.val[0] = rvec_bottom;
+    vec_bottom.val[1] = gvec_bottom;
+    vec_bottom.val[2] = bvec_bottom;
+
+    rgb_to_yuv_conversion(vec_top, vec_bottom);
+
+    vst1q_u8(out_y_top, vec_top.val[0]);
+    vst1q_u8(out_y_bottom, vec_bottom.val[0]);
+
+    const auto uvec = vuzpq_u8(vec_top.val[1], vec_bottom.val[1]);
+    const auto vvec = vuzpq_u8(vec_top.val[2], vec_bottom.val[2]);
+    const auto utmp = vrhaddq_u8(uvec.val[0], uvec.val[1]);
+    const auto vtmp = vrhaddq_u8(vvec.val[0], vvec.val[1]);
+
+    uint8x8x2_t uvvec;
+    uvvec.val[0] = vhadd_u8(vget_low_u8(utmp), vget_high_u8(utmp));
+    uvvec.val[1] = vhadd_u8(vget_low_u8(vtmp), vget_high_u8(vtmp));
+
+    vst2_u8(out_uv, uvvec);
+}
+
+inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
+                              const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
+                              unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
+                              unsigned char *const __restrict out_u,
+                              unsigned char *const __restrict out_v)
+{
+    uint8x16x3_t vec_top, vec_bottom;
+    vec_top.val[0] = rvec_top;
+    vec_top.val[1] = gvec_top;
+    vec_top.val[2] = bvec_top;
+    vec_bottom.val[0] = rvec_bottom;
+    vec_bottom.val[1] = gvec_bottom;
+    vec_bottom.val[2] = bvec_bottom;
+
+    rgb_to_yuv_conversion(vec_top, vec_bottom);
+
+    vst1q_u8(out_y_top, vec_top.val[0]);
+    vst1q_u8(out_y_bottom, vec_bottom.val[0]);
+
+    const auto uvvec_top    = vuzpq_u8(vec_top.val[1], vec_top.val[2]);
+    const auto uvvec_bottom = vuzpq_u8(vec_bottom.val[1], vec_bottom.val[2]);
+    const auto uvvec        = vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]),
+                                        vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1]));
+
+    vst1_u8(out_u, vget_low_u8(uvvec));
+    vst1_u8(out_v, vget_high_u8(uvvec));
+}
+
+inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, const uint8x16_t &bvec,
+                              unsigned char *const __restrict out_y,
+                              unsigned char *const __restrict out_u,
+                              unsigned char *const __restrict out_v)
+{
+    // Convert the uint8x16_t to float32x4x4_t
+    float32x4x4_t frvec, fgvec, fbvec;
+    convert_uint8x16_to_float32x4x4(rvec, frvec);
+    convert_uint8x16_to_float32x4x4(gvec, fgvec);
+    convert_uint8x16_to_float32x4x4(bvec, fbvec);
+
+    float32x4x4_t fyvec, fuvec, fvvec;
+    for(auto i = 0; i < 4; ++i)
+    {
+        rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i],
+                               fyvec.val[i], fuvec.val[i], fvvec.val[i]);
+    }
+
+    uint8x16_t yvec, uvec, vvec;
+    convert_float32x4x4_to_unit8x16(fyvec, yvec);
+    convert_float32x4x4_to_unit8x16(fuvec, uvec);
+    convert_float32x4x4_to_unit8x16(fvvec, vvec);
+
+    vst1q_u8(out_y, yvec);
+    vst1q_u8(out_u, uvec);
+    vst1q_u8(out_v, vvec);
+}
+}
+
+namespace arm_compute
+{
+void colorconvert_rgb_to_rgbx(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IImage *__restrict>(output);
+
+    Iterator in(input_ptr, win);
+    Iterator out(output_ptr, win);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto ta1 = vld3q_u8(in.ptr());
+        uint8x16x4_t ta2;
+        ta2.val[0] = ta1.val[0];
+        ta2.val[1] = ta1.val[1];
+        ta2.val[2] = ta1.val[2];
+        ta2.val[3] = vdupq_n_u8(255);
+        vst4q_u8(out.ptr(), ta2);
+    },
+    in, out);
+}
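Editorial aside (not part of the patch): the scalar form of the BT.709 maths used by the vector routines in this file, with C_u = 1/(2*(1-K_b)) and C_v = 1/(2*(1-K_r)):

    inline void rgb_to_yuv_scalar(float r, float g, float b, float &y, float &u, float &v)
    {
        y = 0.2126f * r + 0.7152f * g + 0.0722f * b; // Y = K_r*R + K_g*G + K_b*B
        u = (b - y) * 0.5389f + 128.f;               // U = C_u * (B - Y) + 128
        v = (r - y) * 0.6350f + 128.f;               // V = C_v * (R - Y) + 128
    }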
+
+void colorconvert_rgbx_to_rgb(const void *input, void *output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IImage *__restrict>(output);
+
+    Iterator in(input_ptr, win);
+    Iterator out(output_ptr, win);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto ta1 = vld4q_u8(in.ptr());
+        uint8x16x3_t ta2;
+        ta2.val[0] = ta1.val[0];
+        ta2.val[1] = ta1.val[1];
+        ta2.val[2] = ta1.val[2];
+        vst3q_u8(out.ptr(), ta2);
+    },
+    in, out);
+}
+
+template <bool yuyv, bool alpha>
+void colorconvert_yuyv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IImage *__restrict>(output);
+
+    constexpr auto element_size = alpha ? 32 : 24;
+    constexpr auto shift        = yuyv ? 0 : 1;
+
+    Iterator in(input_ptr, win);
+    Iterator out(output_ptr, win);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        float32x4x4_t uvec, yvec, vvec, yyvec;
+        const auto ta = vld4q_u8(in.ptr());
+        //ta.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta.val[1] = U0 U2 U4 U6 ...
+        //ta.val[2] = Y1 Y3 Y5 Y7 ...
+        //ta.val[3] = V0 V2 V4 V7 ...
+
+        // Convert the uint8x16x4_t to float32x4x4_t
+        convert_uint8x16_to_float32x4x4(ta.val[0 + shift], yvec);
+        convert_uint8x16_to_float32x4x4(ta.val[1 - shift], uvec);
+        convert_uint8x16_to_float32x4x4(ta.val[2 + shift], yyvec);
+        convert_uint8x16_to_float32x4x4(ta.val[3 - shift], vvec);
+
+        yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
+    },
+    in, out);
+}
+
+template <bool uv, bool alpha>
+void colorconvert_nv12_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
+    const auto output_ptr = static_cast<IImage *__restrict>(output);
+
+    constexpr auto element_size = alpha ? 32 : 24;
+    const auto out_stride       = output_ptr->info()->strides_in_bytes().y();
+    constexpr auto shift        = uv ? 0 : 1;
+
+    // UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in_y(input_ptr->plane(0), win);
+    Iterator in_uv(input_ptr->plane(1), win_uv);
+    Iterator out(output_ptr, win);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto ta_y_top    = vld2q_u8(in_y.ptr());
+        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+        const auto ta_uv       = vld2q_u8(in_uv.ptr());
+        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+        //ta_uv.val[0] = U0 U2 U4 U6 ...
+        //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+        // Convert the uint8x16x4_t to float32x4x4_t
+        float32x4x4_t yvec_top, yyvec_top, yvec_bottom, yyvec_bottom, uvec, vvec;
+        convert_uint8x16_to_float32x4x4(ta_y_top.val[0], yvec_top);
+        convert_uint8x16_to_float32x4x4(ta_y_top.val[1], yyvec_top);
+        convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0], yvec_bottom);
+        convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1], yyvec_bottom);
+        convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift], uvec);
+        convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift], vvec);
+
+        yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
+
+        yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
+    },
+    in_y, in_uv, out);
+}
+
+template <bool alpha>
+void colorconvert_iyuv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
+    const auto output_ptr = static_cast<IImage *__restrict>(output);
+
+    constexpr auto element_size = alpha ? 32 : 24;
+    const auto out_stride       = output_ptr->info()->strides_in_bytes().y();
+
+    // UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in_y(input_ptr->plane(0), win);
+    Iterator in_u(input_ptr->plane(1), win_uv);
+    Iterator in_v(input_ptr->plane(2), win_uv);
+    Iterator out(output_ptr, win);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto ta_y_top    = vld2q_u8(in_y.ptr());
+        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+        const auto ta_u        = vld1q_u8(in_u.ptr());
+        const auto ta_v        = vld1q_u8(in_v.ptr());
+        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+        //ta_u.val[0] = U0 U2 U4 U6 ...
+        //ta_v.val[0] = V0 V2 V4 V6 ...
+
+        // Convert the uint8x16x4_t to float32x4x4_t
+        float32x4x4_t yvec_top, yyvec_top, yvec_bottom, yyvec_bottom, uvec, vvec;
+        convert_uint8x16_to_float32x4x4(ta_y_top.val[0], yvec_top);
+        convert_uint8x16_to_float32x4x4(ta_y_top.val[1], yyvec_top);
+        convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0], yvec_bottom);
+        convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1], yyvec_bottom);
+        convert_uint8x16_to_float32x4x4(ta_u, uvec);
+        convert_uint8x16_to_float32x4x4(ta_v, vvec);
+
+        yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
+
+        yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
+    },
+    in_y, in_u, in_v, out);
+}
+
+template <bool yuyv>
+void colorconvert_yuyv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    constexpr auto shift = yuyv ? 0 : 1;
+
+    // NV12's UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in(input_ptr, win);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_uv(output_ptr->plane(1), win_uv);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto ta_top    = vld4q_u8(in.ptr());
+        const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
+        //ta.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta.val[1] = U0 U2 U4 U6 ...
+        //ta.val[2] = Y1 Y3 Y5 Y7 ...
+        //ta.val[3] = V0 V2 V4 V7 ...
+
+        uint8x16x2_t yvec;
+        yvec.val[0] = ta_top.val[0 + shift];
+        yvec.val[1] = ta_top.val[2 + shift];
+        vst2q_u8(out_y.ptr(), yvec);
+
+        uint8x16x2_t yyvec;
+        yyvec.val[0] = ta_bottom.val[0 + shift];
+        yyvec.val[1] = ta_bottom.val[2 + shift];
+        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
+
+        uint8x16x2_t uvvec;
+        uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
+        uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
+        vst2q_u8(out_uv.ptr(), uvvec);
+    },
+    in, out_y, out_uv);
+}
+
+void colorconvert_iyuv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    // UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in_y(input_ptr->plane(0), win);
+    Iterator in_u(input_ptr->plane(1), win_uv);
+    Iterator in_v(input_ptr->plane(2), win_uv);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_uv(output_ptr->plane(1), win_uv);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto ta_y_top    = vld2q_u8(in_y.ptr());
+        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+        uint8x16x2_t ta_uv;
+        ta_uv.val[0] = vld1q_u8(in_u.ptr());
+        ta_uv.val[1] = vld1q_u8(in_v.ptr());
+        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+        //ta_uv.val[0] = U0 U2 U4 U6 ...
+        //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+        vst2q_u8(out_y.ptr(), ta_y_top);
+        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+        vst2q_u8(out_uv.ptr(), ta_uv);
+    },
+    in_y, in_u, in_v, out_y, out_uv);
+}
+
+template <bool uv>
+void colorconvert_nv12_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    constexpr auto shift = uv ? 0 : 1;
+
+    // UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in_y(input_ptr->plane(0), win);
+    Iterator in_uv(input_ptr->plane(1), win_uv);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_u(output_ptr->plane(1), win_uv);
+    Iterator out_v(output_ptr->plane(2), win_uv);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto ta_y_top    = vld2q_u8(in_y.ptr());
+        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+        const auto ta_uv       = vld2q_u8(in_uv.ptr());
+        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+        //ta_uv.val[0] = U0 U2 U4 U6 ...
+        //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+        vst2q_u8(out_y.ptr(), ta_y_top);
+        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+        vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]);
+        vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]);
+    },
+    in_y, in_uv, out_y, out_u, out_v);
+}
+
+template <bool yuyv>
+void colorconvert_yuyv_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    constexpr auto shift = yuyv ? 0 : 1;
+
+    // Destination's UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in(input_ptr, win);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_u(output_ptr->plane(1), win_uv);
+    Iterator out_v(output_ptr->plane(2), win_uv);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto ta_top    = vld4q_u8(in.ptr());
+        const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
+        //ta.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta.val[1] = U0 U2 U4 U6 ...
+        //ta.val[2] = Y1 Y3 Y5 Y7 ...
+        //ta.val[3] = V0 V2 V4 V7 ...
+
+        uint8x16x2_t yvec;
+        yvec.val[0] = ta_top.val[0 + shift];
+        yvec.val[1] = ta_top.val[2 + shift];
+        vst2q_u8(out_y.ptr(), yvec);
+
+        uint8x16x2_t yyvec;
+        yyvec.val[0] = ta_bottom.val[0 + shift];
+        yyvec.val[1] = ta_bottom.val[2 + shift];
+        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
+
+        uint8x16_t uvec;
+        uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
+        vst1q_u8(out_u.ptr(), uvec);
+
+        uint8x16_t vvec;
+        vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
+        vst1q_u8(out_v.ptr(), vvec);
+    },
+    in, out_y, out_u, out_v);
+}
+
+template <bool uv>
+void colorconvert_nv12_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    constexpr auto shift = uv ? 0 : 1;
+
+    // UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in_y(input_ptr->plane(0), win);
+    Iterator in_uv(input_ptr->plane(1), win_uv);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_u(output_ptr->plane(1), win);
+    Iterator out_v(output_ptr->plane(2), win);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto ta_y_top    = vld2q_u8(in_y.ptr());
+        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+        const auto ta_uv       = vld2q_u8(in_uv.ptr());
+        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+        //ta_uv.val[0] = U0 U2 U4 U6 ...
+        //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+        vst2q_u8(out_y.ptr(), ta_y_top);
+        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+
+        uint8x16x2_t uvec;
+        uvec.val[0] = ta_uv.val[0 + shift];
+        uvec.val[1] = ta_uv.val[0 + shift];
+        vst2q_u8(out_u.ptr(), uvec);
+        vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
+
+        uint8x16x2_t vvec;
+        vvec.val[0] = ta_uv.val[1 - shift];
+        vvec.val[1] = ta_uv.val[1 - shift];
+        vst2q_u8(out_v.ptr(), vvec);
+        vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
+    },
+    in_y, in_uv, out_y, out_u, out_v);
+}
+
+void colorconvert_iyuv_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    // UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in_y(input_ptr->plane(0), win);
+    Iterator in_u(input_ptr->plane(1), win_uv);
+    Iterator in_v(input_ptr->plane(2), win_uv);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_u(output_ptr->plane(1), win);
+    Iterator out_v(output_ptr->plane(2), win);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto ta_y_top    = vld2q_u8(in_y.ptr());
+        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+        const auto ta_u        = vld1q_u8(in_u.ptr());
+        const auto ta_v        = vld1q_u8(in_v.ptr());
+        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+        //ta_u = U0 U2 U4 U6 ...
+        //ta_v = V0 V2 V4 V6 ...
+
+        vst2q_u8(out_y.ptr(), ta_y_top);
+        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+
+        uint8x16x2_t uvec;
+        uvec.val[0] = ta_u;
+        uvec.val[1] = ta_u;
+        vst2q_u8(out_u.ptr(), uvec);
+        vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
+
+        uint8x16x2_t vvec;
+        vvec.val[0] = ta_v;
+        vvec.val[1] = ta_v;
+        vst2q_u8(out_v.ptr(), vvec);
+        vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
+    },
+    in_y, in_u, in_v, out_y, out_u, out_v);
+}
+
+template <bool alpha>
+void colorconvert_rgb_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    // UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in(input_ptr, win);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_uv(output_ptr->plane(1), win_uv);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto ta_rgb_top    = load_rgb(in.ptr(), alpha);
+        const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
+        //ta_rgb.val[0] = R0 R1 R2 R3 ...
+        //ta_rgb.val[1] = G0 G1 G2 G3 ...
+        //ta_rgb.val[2] = B0 B1 B2 B3 ...
+
+        store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
+                          ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
+                          out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
+                          out_uv.ptr());
+    },
+    in, out_y, out_uv);
+}
+
+template <bool alpha>
+void colorconvert_rgb_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    // UV's width and height are subsampled
+    Window win_uv(win);
+    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+    win_uv.validate();
+
+    Iterator in(input_ptr, win);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_u(output_ptr->plane(1), win_uv);
+    Iterator out_v(output_ptr->plane(2), win_uv);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto ta_rgb_top    = load_rgb(in.ptr(), alpha);
+        const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
+        //ta_rgb.val[0] = R0 R1 R2 R3 ...
+        //ta_rgb.val[1] = G0 G1 G2 G3 ...
+        //ta_rgb.val[2] = B0 B1 B2 B3 ...
+
+        store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
+                          ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
+                          out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
+                          out_u.ptr(), out_v.ptr());
+    },
+    in, out_y, out_u, out_v);
+}
+
+template <bool alpha>
+void colorconvert_rgb_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+    Iterator in(input_ptr, win);
+    Iterator out_y(output_ptr->plane(0), win);
+    Iterator out_u(output_ptr->plane(1), win);
+    Iterator out_v(output_ptr->plane(2), win);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto ta_rgb = load_rgb(in.ptr(), alpha);
+        //ta_rgb.val[0] = R0 R1 R2 R3 ...
+        //ta_rgb.val[1] = G0 G1 G2 G3 ...
+        //ta_rgb.val[2] = B0 B1 B2 B3 ...
+
+        store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2],
+                          out_y.ptr(), out_u.ptr(), out_v.ptr());
+    },
+    in, out_y, out_u, out_v);
+}
+}
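Editorial aside (not part of the patch): in the 4:2:0 outputs above (NV12/IYUV), each U/V sample covers a 2x2 block of pixels, which is why the UV windows step at half rate and why the stores combine vrhaddq_u8 (rounding halving add) and vhadd_u8 (halving add). Per 2x2 block the chroma that gets written is roughly:

    // u00, u01 = top-row chroma; u10, u11 = bottom-row chroma
    uint8_t u = (((u00 + u01 + 1) / 2) + ((u10 + u11 + 1) / 2)) / 2; // ~mean of the block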
diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h
new file mode 100644
index 0000000000..fb712611cb
--- /dev/null
+++ b/arm_compute/core/NEON/NEFixedPoint.h
@@ -0,0 +1,686 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFIXEDPOINT_H__
+#define __ARM_COMPUTE_NEFIXEDPOINT_H__
+
+#include "arm_compute/core/FixedPoint.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+using qint8x8_t = int8x8_t;       /**< 8 bit fixed point vector with 8 elements */
+using qint8x8x2_t = int8x8x2_t;   /**< 8 bit fixed point vector with 16 elements */
+using qint8x8x3_t = int8x8x3_t;   /**< 8 bit fixed point vector with 24 elements */
+using qint8x8x4_t = int8x8x4_t;   /**< 8 bit fixed point vector with 32 elements */
+using qint8x16_t = int8x16_t;     /**< 8 bit fixed point vector with 16 elements */
+using qint8x16x2_t = int8x16x2_t; /**< 8 bit fixed point vector with 32 elements */
+using qint8x16x3_t = int8x16x3_t; /**< 8 bit fixed point vector with 48 elements */
+using qint8x16x4_t = int8x16x4_t; /**< 8 bit fixed point vector with 64 elements */
+using qint16x4_t = int16x4_t;     /**< 16 bit fixed point vector with 4 elements */
+using qint16x4x2_t = int16x4x2_t; /**< 16 bit fixed point vector with 8 elements */
+using qint16x4x3_t = int16x4x3_t; /**< 16 bit fixed point vector with 12 elements */
+using qint16x4x4_t = int16x4x4_t; /**< 16 bit fixed point vector with 16 elements */
+using qint16x8_t = int16x8_t;     /**< 16 bit fixed point vector with 8 elements */
+using qint16x8x2_t = int16x8x2_t; /**< 16 bit fixed point vector with 16 elements */
+using qint16x8x3_t = int16x8x3_t; /**< 16 bit fixed point vector with 24 elements */
+using qint16x8x4_t = int16x8x4_t; /**< 16 bit fixed point vector with 32 elements */
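Editorial aside (not part of the patch): the convention behind these aliases is that a QS8/QS16 value stores real_value * 2^fixed_point_position in a signed integer. With fixed_point_position = 5, for example:

    const qint8_t raw  = static_cast<qint8_t>(1.5f * (1 << 5)); // 48
    const float   back = static_cast<float>(raw) / (1 << 5);    // 1.5f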
+
+/** Get the lower half of a 16 elements vector
+ *
+ * @param[in] a vector of 16 elements
+ *
+ * @return 8 bit fixed point vector (8 elements)
+ */
+qint8x8_t vget_low_qs8(qint8x16_t a);
+
+/** Get the higher half of a 16 elements vector
+ *
+ * @param[in] a vector of 16 elements
+ *
+ * @return 8 bit fixed point vector (8 elements)
+ */
+qint8x8_t vget_high_qs8(qint8x16_t a);
+
+/** Load a single 8 bit fixed point vector from memory (8 elements)
+ *
+ * @param[in] addr Memory address of the 8 bit fixed point vector to load
+ *
+ * @return 8 bit fixed point vector (8 elements)
+ */
+qint8x8_t vld1_qs8(const qint8_t *addr);
+
+/** Load a single 8 bit fixed point vector from memory (16 elements)
+ *
+ * @param[in] addr Memory address of the 8 bit fixed point vector to load
+ *
+ * @return 8 bit fixed point vector (16 elements)
+ */
+qint8x16_t vld1q_qs8(const qint8_t *addr);
+
+/** Load a single 16 bit fixed point vector from memory (4 elements)
+ *
+ * @param[in] addr Memory address of the 16 bit fixed point vector to load
+ *
+ * @return 16 bit fixed point vector (4 elements)
+ */
+qint16x4_t vld1_qs16(const qint16_t *addr);
+
+/** Load a single 16 bit fixed point vector from memory (8 elements)
+ *
+ * @param[in] addr Memory address of the 16 bit fixed point vector to load
+ *
+ * @return 16 bit fixed point vector (8 elements)
+ */
+qint16x8_t vld1q_qs16(const qint16_t *addr);
+
+/** Load all lanes of 8 bit fixed point vector with same value from memory (8 elements)
+ *
+ * @param[in] addr Memory address of the 8 bit fixed point scalar value to load
+ *
+ * @return 8 bit fixed point vector (8 elements)
+ */
+qint8x8_t vld1_dup_qs8(const qint8_t *addr);
+
+/** Load all lanes of 8 bit fixed point vector with same value from memory (16 elements)
+ *
+ * @param[in] addr Memory address of the 8 bit fixed point scalar value to load
+ *
+ * @return 8 bit fixed point vector (16 elements)
+ */
+qint8x16_t vld1q_dup_qs8(const qint8_t *addr);
+
+/** Store a single 8 bit fixed point vector to memory (8 elements)
+ *
+ * @param[in] addr Memory address where the 8 bit fixed point vector should be stored
+ * @param[in] b    8 bit fixed point vector to store
+ *
+ */
+void vst1_qs8(qint8_t *addr, qint8x8_t b);
+
+/** Store a single 8 bit fixed point vector to memory (16 elements)
+ *
+ * @param[in] addr Memory address where the 8 bit fixed point vector should be stored
+ * @param[in] b    8 bit fixed point vector to store
+ *
+ */
+void vst1q_qs8(qint8_t *addr, qint8x16_t b);
+
+/** Store a single 16 bit fixed point vector to memory (4 elements)
+ *
+ * @param[in] addr Memory address where the 16 bit fixed point vector should be stored
+ * @param[in] b    16 bit fixed point vector to store
+ *
+ */
+void vst1_qs16(qint16_t *addr, qint16x4_t b);
+
+/** Store a single 16 bit fixed point vector to memory (8 elements)
+ *
+ * @param[in] addr Memory address where the 16 bit fixed point vector should be stored
+ * @param[in] b    16 bit fixed point vector to store
+ *
+ */
+void vst1q_qs16(qint16_t *addr, qint16x8_t b);
+
+/** 16 bit fixed point vector saturating narrow (8 elements)
+ *
+ * @param[in] a 16 bit fixed point vector to convert
+ *
+ * @return 8 bit fixed point vector
+ */
+qint8x8_t vqmovn_qs16(qint16x8_t a);
+
+/** 8 bit fixed point vector duplicate (8 elements)
+ *
+ * @param[in] a 8 bit fixed point to duplicate
+ *
+ * @return The result of the vector duplication
+ */
+qint8x8_t vdup_n_qs8(qint8_t a);
+
+/** 8 bit fixed point vector duplicate (16 elements)
+ *
+ * @param[in] a 8 bit fixed point to duplicate
+ *
+ * @return The result of the vector duplication
+ */
+qint8x16_t vdupq_n_qs8(qint8_t a);
+
+/** Duplicate a float and convert it to 8 bit fixed point vector (16 elements)
+ *
+ * @param[in] a                    Float to duplicate
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the vector duplication
+ */
+qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position);
+
+/** 16 bit fixed point vector duplicate (8 elements)
+ *
+ * @param[in] a 16 bit fixed point to duplicate
+ *
+ * @return The result of the vector duplication
+ */
+qint16x8_t vdupq_n_qs16(qint16_t a);
+
+/** Absolute value of 8 bit fixed point vector (8 elements)
+ *
+ * @param[in] a 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector absolute value
+ */
+qint8x8_t vabs_qs8(qint8x8_t a);
+
+/** Absolute value of 8 bit fixed point vector (16 elements)
+ *
+ * @param[in] a 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector absolute value
+ */
+qint8x16_t vabsq_qs8(qint8x16_t a);
+
+/** Saturating absolute value of 8 bit fixed point vector (8 elements)
+ *
+ * @param[in] a 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector absolute value
+ */
+qint8x8_t vqabs_qs8(qint8x8_t a);
+
+/** Saturating absolute value of 8 bit fixed point vector (16 elements)
+ *
+ * @param[in] a 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector absolute value
+ */
+qint8x16_t vqabsq_qs8(qint8x16_t a);
+
+/** 8 bit fixed point vector max (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector max operation
+ */
+qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector max (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector max operation
+ */
+qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 8 bit fixed point vector pairwise max (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector pairwise max operation
+ */
+qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector min (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector min operation
+ */
+qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector min (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector min operation
+ */
+qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 8 bit fixed point vector pairwise min (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector pairwise min operation
+ */
+qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector add (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector addition
+ */
+qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector add (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector addition
+ */
+qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 8 bit fixed point vector saturating add (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector addition. The result is saturated in case of overflow
+ */
+qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector saturating add (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector addition. The result is saturated in case of overflow
+ */
+qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 16 bit fixed point vector saturating add (4 elements)
+ *
+ * @param[in] a First 16 bit fixed point input vector
+ * @param[in] b Second 16 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector addition. The result is saturated in case of overflow
+ */
+qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b);
+/** 16 bit fixed point vector saturating add (4 elements) + * + * @param[in] a First 16 bit fixed point input vector + * @param[in] b Second 16 bit fixed point input vector + * + * @return The result of the 16 bit fixed point vector addition. The result is saturated in case of overflow + */ +qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b); + +/** 16 bit fixed point vector saturating add (8 elements) + * + * @param[in] a First 16 bit fixed point input vector + * @param[in] b Second 16 bit fixed point input vector + * + * @return The result of the 16 bit fixed point vector addition. The result is saturated in case of overflow + */ +qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b); + +/** 8 bit fixed point vector pairwise add long (8 elements) + * + * @param[in] a 8 bit fixed point input vector + * + * @return The 16 bit result of the pairwise addition of adjacent 8 bit elements + */ +int16x4_t vpaddl_qs8(qint8x8_t a); + +/** 8 bit fixed point vector subtraction (8 elements) + * + * @param[in] a First 8 bit fixed point input vector + * @param[in] b Second 8 bit fixed point input vector + * + * @return The result of the 8 bit fixed point vector subtraction + */ +qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b); + +/** 8 bit fixed point vector subtraction (16 elements) + * + * @param[in] a First 8 bit fixed point input vector + * @param[in] b Second 8 bit fixed point input vector + * + * @return The result of the 8 bit fixed point vector subtraction + */ +qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b); + +/** 8 bit fixed point vector saturating subtraction (8 elements) + * + * @param[in] a First 8 bit fixed point input vector + * @param[in] b Second 8 bit fixed point input vector + * + * @return The result of the 8 bit fixed point vector subtraction. The result is saturated in case of overflow + */ +qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b); + +/** 8 bit fixed point vector saturating subtraction (16 elements) + * + * @param[in] a First 8 bit fixed point input vector + * @param[in] b Second 8 bit fixed point input vector + * + * @return The result of the 8 bit fixed point vector subtraction. The result is saturated in case of overflow + */ +qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b); + +/** 8 bit fixed point vector multiply (8 elements) + * + * @param[in] a First 8 bit fixed point input vector + * @param[in] b Second 8 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8 bit fixed point vector multiplication. + */ +qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position); + +/** 8 bit fixed point vector multiply (16 elements) + * + * @param[in] a First 8 bit fixed point input vector + * @param[in] b Second 8 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8 bit fixed point vector multiplication. + */ +qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
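+/** The multiply family computes (a * b) >> fixed_point_position with rounding, i.e. the product of two Q values renormalised back into the same Q format. A scalar sketch of what one lane of vmul_qs8() does (assuming Q3.4; this mirrors the .inl implementation): + * + * @code + * const int fpp = 4; + * const qint8_t a = 24, b = 8; // 1.5 and 0.5 in Q3.4 + * int16_t acc = 1 << (fpp - 1); // rounding constant, as in the implementation + * acc += static_cast<int16_t>(a) * static_cast<int16_t>(b); // widening multiply-accumulate + * const qint8_t res = static_cast<qint8_t>(acc >> fpp); // res == 12, i.e. 0.75 in Q3.4 + * @endcode + */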
+/** 8 bit fixed point vector saturating multiply (8 elements) + * + * @param[in] a First 8 bit fixed point input vector + * @param[in] b Second 8 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8 bit fixed point vector multiplication. The result is saturated in case of overflow + */ +qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position); + +/** 8 bit fixed point vector saturating multiply (16 elements) + * + * @param[in] a First 8 bit fixed point input vector + * @param[in] b Second 8 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8 bit fixed point vector multiplication. The result is saturated in case of overflow + */ +qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position); + +/** 8 bit fixed point vector long multiply (8 elements) + * + * @param[in] a First 8 bit fixed point input vector + * @param[in] b Second 8 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8 bit fixed point long vector multiplication. + */ +qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position); + +/** 8 bit fixed point vector multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and adds the result to @p a (a + b * c). + * + * @param[in] a First 8 bit fixed point input vector where the result of multiplication must be added to + * @param[in] b Second 8 bit fixed point input vector + * @param[in] c Third 8 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8 bit fixed point vector multiply-accumulate + */ +qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position); + +/** 8 bit fixed point vector multiply-accumulate (16 elements). This operation performs the product between @p b and @p c and adds the result to @p a (a + b * c). + * + * @param[in] a First 8 bit fixed point input vector where the result of multiplication must be added to + * @param[in] b Second 8 bit fixed point input vector + * @param[in] c Third 8 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8 bit fixed point vector multiply-accumulate + */ +qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position); + +/** 8 bit fixed point vector saturating multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and adds the result to @p a (a + b * c). + * + * @param[in] a First 8 bit fixed point input vector where the result of multiplication must be added to + * @param[in] b Second 8 bit fixed point input vector + * @param[in] c Third 8 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8 bit fixed point vector multiply-accumulate. The result is saturated in case of overflow + */ +qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
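+/** In the multiply-accumulate family only the product is renormalised; the accumulator @p a is already in the target Q format and is added as-is, i.e. res = a + ((b * c) >> fixed_point_position). A scalar sketch per lane (assuming Q3.4): + * + * @code + * const int fpp = 4; + * const qint8_t a = 16, b = 24, c = 8; // 1.0, 1.5 and 0.5 in Q3.4 + * const int16_t prod = (static_cast<int16_t>(b) * static_cast<int16_t>(c) + (1 << (fpp - 1))) >> fpp; // 12 + * const qint8_t res = static_cast<qint8_t>(a + prod); // 28, i.e. 1.75 in Q3.4 + * @endcode + */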
+/** 8 bit fixed point vector saturating multiply-accumulate (16 elements). This operation performs the product between @p b and @p c and adds the result to @p a (a + b * c). + * + * @param[in] a First 8 bit fixed point input vector where the result of multiplication must be added to + * @param[in] b Second 8 bit fixed point input vector + * @param[in] c Third 8 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8 bit fixed point vector multiply-accumulate. The result is saturated in case of overflow + */ +qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position); + +/** 8 bit fixed point vector multiply-accumulate long (8 elements). + * This operation performs the product between @p b and @p c and adds the result to the 16 bit fixed point vector @p a (a + b * c). + * + * @param[in] a First 16 bit fixed point input vector where the result of multiplication must be added to + * @param[in] b Second 8 bit fixed point input vector + * @param[in] c Third 8 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8 bit fixed point vector multiply-accumulate long + */ +qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position); + +/** 8 bit fixed point vector saturating multiply-accumulate long (8 elements). The saturation is performed on the 16 bit fixed point output vector. + * This operation performs the product between @p b and @p c and adds the result to the 16 bit fixed point vector @p a (a + b * c). + * + * @param[in] a First 16 bit fixed point input vector where the result of multiplication must be added to + * @param[in] b Second 8 bit fixed point input vector + * @param[in] c Third 8 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8 bit fixed point vector multiply-accumulate long + */ +qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position); + +/** Convert a float vector with 4x2 elements to 8 bit fixed point vector with 8 elements + * + * @param[in] a Float input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the conversion float -> 8 bit fixed point + */ +qint8x8_t vcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position); + +/** Convert a float vector with 4x4 elements to 8 bit fixed point vector with 16 elements + * + * @param[in] a Float input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the conversion float -> 8 bit fixed point + */ +qint8x16_t vcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position); + +/** Convert an 8 bit fixed point vector with 8 elements to a float vector with 4x2 elements + * + * @param[in] a 8 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the conversion 8 bit fixed point -> float32x4x2 + */ +float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position);
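+/** Conversion in either direction is a scale by 2^fixed_point_position; the float -> fixed point path also adds 0.5 for rounding and saturates (see the .inl implementation). Round-tripping through 8 bit fixed point therefore quantises to the nearest multiple of 1/2^fixed_point_position. A minimal sketch: + * + * @code + * const float32x4x2_t in = { { vdupq_n_f32(0.3f), vdupq_n_f32(0.3f) } }; + * const qint8x8_t q = vcvt_qs8_f32(in, 4); // each lane holds round(0.3f * 16) == 5 + * const float32x4x2_t out = vcvt_f32_qs8(q, 4); // each lane reads back 5 / 16.0f == 0.3125f + * @endcode + */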
+/** Convert an 8 bit fixed point vector with 16 elements to a float vector with 4x4 elements + * + * @param[in] a 8 bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the conversion 8 bit fixed point -> float32x4x4 + */ +float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position); + +/** Calculate reciprocal of a fixed point 8bit number using the Newton-Raphson method. (8 elements) + * + * @param[in] a 8bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8bit reciprocal (1/a). + */ +qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position); + +/** Calculate reciprocal of a fixed point 8bit number using the Newton-Raphson method. (16 elements) + * + * @param[in] a 8bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8bit reciprocal (1/a). + */ +qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position); + +/** Division fixed point 8bit (8 elements) + * + * @param[in] a First 8bit fixed point input vector + * @param[in] b Second 8bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The quotient in 8bit fixed point format. + */ +qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position); + +/** Division fixed point 8bit (16 elements) + * + * @param[in] a First 8bit fixed point input vector + * @param[in] b Second 8bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The quotient in 8bit fixed point format. + */ +qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
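+/** Division is implemented as multiplication by the Newton-Raphson reciprocal, so its accuracy is bounded by that of vrecip_qs8() / vrecipq_qs8(). Conceptually (this mirrors the actual .inl implementation): + * + * @code + * // a / b == a * (1 / b), evaluated entirely in the fixed point domain: + * const qint8x8_t quotient = vmul_qs8(a, vrecip_qs8(b, fixed_point_position), fixed_point_position); + * @endcode + */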
+/** Perform a 4th degree polynomial approximation. (8 elements) + * + * @param[in] a 8bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8bit Taylor approximation. + */ +template <bool islog> +qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position); + +/** Perform a 4th degree polynomial approximation. (16 elements) + * + * @param[in] a 8bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8bit Taylor approximation. + */ +template <bool islog> +qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position); + +/** Calculate saturating exponential fixed point 8bit (8 elements) + * + * @param[in] a 8bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8bit saturating exponential + */ +qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position); + +/** Calculate saturating exponential fixed point 8bit (16 elements) + * + * @param[in] a 8bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8bit saturating exponential + */ +qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position); + +/** Calculate logarithm fixed point 8bit (8 elements) + * + * @param[in] a 8bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8bit logarithm. + */ +qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position); + +/** Calculate logarithm fixed point 8bit (16 elements) + * + * @param[in] a 8bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8bit logarithm. + */ +qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position); + +/** Calculate inverse square root for fixed point 8bit using Newton-Raphson method (8 elements) + * + * @param[in] a 8bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8bit inverse sqrt. + */ +qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position); + +/** Calculate saturating inverse square root for fixed point 8bit using Newton-Raphson method (8 elements) + * + * @param[in] a 8bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8bit inverse sqrt. + */ +qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position); + +/** Calculate inverse square root for fixed point 8bit using Newton-Raphson method (16 elements) + * + * @param[in] a 8bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8bit inverse sqrt. + */ +qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position); + +/** Calculate saturating inverse square root for fixed point 8bit using Newton-Raphson method (16 elements) + * + * @param[in] a 8bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8bit inverse sqrt. + */ +qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position);
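+/** The inverse square root functions iterate the Newton-Raphson recurrence x_{n+1} = x_n * (3 - a * x_n^2) / 2, which converges to 1/sqrt(a); at 8 bit precision three iterations suffice. One iteration in scalar form (an illustrative sketch only): + * + * @code + * float nr_invsqrt_step(float a, float x) { return x * (3.0f - a * x * x) * 0.5f; } + * @endcode + */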
+/** Calculate hyperbolic tangent for fixed point 8bit (8 elements) + * + * @param[in] a 8bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The calculated Hyperbolic Tangent. + */ +qint8x8_t vtanh_qs8(qint8x8_t a, int fixed_point_position); + +/** Calculate hyperbolic tangent for fixed point 8bit (16 elements) + * + * @param[in] a 8bit fixed point input vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The calculated Hyperbolic Tangent. + */ +qint8x16_t vtanhq_qs8(qint8x16_t a, int fixed_point_position); + +/** Calculate saturating n power for fixed point 8bit (16 elements). + * + * pow(a,b) = e^(b*log(a)) + * + * @param[in] a 8bit fixed point input vector + * @param[in] b 8bit fixed point power vector + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8bit power. + */ +qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
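+/** A usage sketch (assuming Q3.4, i.e. fixed_point_position = 4; results are approximate because the implementation goes through the exp/log identity above): + * + * @code + * const qint8x16_t base = vdupq_n_qs8(24); // 1.5 in Q3.4 + * const qint8x16_t n = vdupq_n_qs8(32); // 2.0 in Q3.4 + * const qint8x16_t p = vqpowq_qs8(base, n, 4); // ~2.25, i.e. raw values close to 36 + * @endcode + */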
+} +#include "arm_compute/core/NEON/NEFixedPoint.inl" +#endif /* __ARM_COMPUTE_NEFIXEDPOINT_H__ */ diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl new file mode 100644 index 0000000000..6db344dc11 --- /dev/null +++ b/arm_compute/core/NEON/NEFixedPoint.inl @@ -0,0 +1,1018 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +namespace arm_compute +{ +/** Exponent polynomial coefficients for 8 bit fixed point (8 elements) + * Format is in Q0.7 for all elements */ +const std::array<qint8x8_t, 4> exp_tab_qs8 = +{ + { + vdup_n_s8(0x7F), // 0.9978546 + vdup_n_s8(0x3F), // 0.4994721 + vdup_n_s8(0x16), // 0.1763723 + vdup_n_s8(0x05), // 0.0435108 + } +}; + +/** Exponent polynomial coefficients for 8 bit fixed point (16 elements) + * Format is in Q0.7 for all elements */ +const std::array<qint8x16_t, 4> exp_tabq_qs8 = +{ + { + vdupq_n_s8(0x7F), // 0.9978546 + vdupq_n_s8(0x3F), // 0.4994721 + vdupq_n_s8(0x16), // 0.1763723 + vdupq_n_s8(0x05), // 0.0435108 + } +}; + +/** Logarithm polynomial coefficients for 8 bit fixed point (8 elements) + * Format is in Q0.7 for all elements except the first one which is in Q1.6 */ +const std::array<qint8x8_t, 4> log_tab_qs8 = +{ + { + vdup_n_s8(0x5C), // 1.4384189 + vdup_n_s8(-0x56), // -0.6771900 + vdup_n_s8(0x29), // 0.3218538 + vdup_n_s8(-0x0A), // -0.0832229 + } +}; + +/** Logarithm polynomial coefficients for 8 bit fixed point (16 elements) + * Format is in Q0.7 for all elements except the first one which is in Q1.6 */ +const std::array<qint8x16_t, 4> log_tabq_qs8 = +{ + { + vdupq_n_s8(0x5C), // 1.4384189 + vdupq_n_s8(-0x56), // -0.6771900 + vdupq_n_s8(0x29), // 0.3218538 + vdupq_n_s8(-0x0A), // -0.0832229 + } +}; + +inline qint8x8_t vget_low_qs8(qint8x16_t a) +{ + return vget_low_s8(a); +} + +inline qint8x8_t vget_high_qs8(qint8x16_t a) +{ + return vget_high_s8(a); +} + +inline qint8x8_t vld1_qs8(const qint8_t *addr) +{ + return vld1_s8(addr); +} + +inline qint8x16_t vld1q_qs8(const qint8_t *addr) +{ + return vld1q_s8(addr); +} + +inline qint16x4_t vld1_qs16(const qint16_t *addr) +{ + return vld1_s16(addr); +} + +inline qint16x8_t vld1q_qs16(const qint16_t *addr) +{ + return vld1q_s16(addr); +} + +inline qint8x8_t vld1_dup_qs8(const qint8_t *addr) +{ + return vld1_dup_s8(addr); +} + +inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr) +{ + return vld1q_dup_s8(addr); +} + +inline void vst1_qs8(qint8_t *addr, qint8x8_t b) +{ + vst1_s8(addr, b); +} + +inline void vst1q_qs8(qint8_t *addr, qint8x16_t b) +{ + vst1q_s8(addr, b); +} + +inline void vst1_qs16(qint16_t *addr, qint16x4_t b) +{ + vst1_s16(addr, b); +} + +inline void vst1q_qs16(qint16_t *addr, qint16x8_t b) +{ + vst1q_s16(addr, b); +} + +inline qint8x8_t vqmovn_qs16(qint16x8_t a) +{ + return vqmovn_s16(a); +} + +inline qint8x8_t vdup_n_qs8(qint8_t a) +{ + return vdup_n_s8(a); +} + +inline qint8x16_t vdupq_n_qs8(qint8_t a) +{ + return vdupq_n_s8(a); +} + +inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position) +{ + float32x4x4_t res = + { + { + vdupq_n_f32(a), + vdupq_n_f32(a), + vdupq_n_f32(a), + vdupq_n_f32(a), + } + }; + return vcvtq_qs8_f32(res, fixed_point_position); +} + +inline qint16x8_t vdupq_n_qs16(qint16_t a) +{ + return vdupq_n_s16(a); +} + +inline qint8x8_t vabs_qs8(qint8x8_t a) +{ + return vabs_s8(a); +} + +inline qint8x16_t vabsq_qs8(qint8x16_t a) +{ + return vabsq_s8(a); +} + +inline qint8x8_t vqabs_qs8(qint8x8_t a) +{ + return vqabs_s8(a); +} + +inline qint8x16_t vqabsq_qs8(qint8x16_t a) +{ + return vqabsq_s8(a); +} + +inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b) +{ + return vmax_s8(a, b); +} + +inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b) +{ + return vmaxq_s8(a, b); +} + +inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b) +{ + return vpmax_s8(a, b); +} + +inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b) +{ + return vmin_s8(a, b); +} + +inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b) +{ + return vminq_s8(a,
b); +} + +inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b) +{ + return vpmin_s8(a, b); +} + +inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b) +{ + return vadd_s8(a, b); +} + +inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b) +{ + return vaddq_s8(a, b); +} + +inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b) +{ + return vqadd_s8(a, b); +} + +inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b) +{ + return vqaddq_s8(a, b); +} + +inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b) +{ + return vqadd_s16(a, b); +} + +inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b) +{ + return vqaddq_s16(a, b); +} + +inline int16x4_t vpaddl_qs8(qint8x8_t a) +{ + return vpaddl_s8(a); +} + +inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b) +{ + return vsub_s8(a, b); +} + +inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b) +{ + return vsubq_s8(a, b); +} + +inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b) +{ + return vqsub_s8(a, b); +} + +inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b) +{ + return vqsubq_s8(a, b); +} + +inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position) +{ + const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); + + // Initialize the temporary result with a constant used to round up the result + qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1)); + + // Vector multiply-accumulate long + res = vmlal_s8(res, a, b); + + // Shift right by fixed_point_position + res = vshlq_s16(res, fixed_point_position_s16); + + // Convert back to qint8 + return vmovn_s16(res); +} + +inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position) +{ + const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); + + // Initialize the temporary results with a constant used to round up the result + qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1)); + qint16x8_t res1 = res0; + + // Vector multiply-accumulate long + res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b)); + res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b)); + + // Shift right by fixed_point_position + res0 = vshlq_s16(res0, fixed_point_position_s16); + res1 = vshlq_s16(res1, fixed_point_position_s16); + + // Convert back to qint8 + return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1)); +} + +inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position) +{ + const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); + + // Initialize the temporary result with a constant used to round up the result + qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1)); + + // Vector multiply-accumulate long + res = vmlal_s8(res, a, b); + + // Shift right by fixed_point_position + res = vqshlq_s16(res, fixed_point_position_s16); + + // Convert back to qint8 and saturate + return vqmovn_s16(res); +} + +inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position) +{ + const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); + + // Initialize the temporary results with a constant used to round up the result + qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1)); + qint16x8_t res1 = res0; + + // Vector multiply-accumulate long + res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b)); + res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b)); + + // Shift right by fixed_point_position + res0 = vqshlq_s16(res0, fixed_point_position_s16); + res1 = vqshlq_s16(res1, fixed_point_position_s16); + + // Convert back to qint8 
and saturate + return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1)); +} + +inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position) +{ + const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); + + qint16x8_t res = vmull_s8(a, b); + + return vqrshlq_s16(res, fixed_point_position_s16); +} + +inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position) +{ + const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); + + // Initialize the temporary results with a constant used to round up the result + qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1)); + + // Vector multiply-accumulate long + tmp = vmlal_s8(tmp, b, c); + + // Shift right by fixed_point_position + tmp = vshlq_s16(tmp, fixed_point_position_s16); + + // Convert back to qint8 and accumulate + return vadd_s8(a, vmovn_s16(tmp)); +} + +inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position) +{ + const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); + + // Initialize the temporary results with a constant used to round up the result + qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1)); + qint16x8_t tmp1 = tmp0; + + // Vector multiply-accumulate long + tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c)); + tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c)); + + // Shift right by fixed_point_position + tmp0 = vshlq_s16(tmp0, fixed_point_position_s16); + tmp1 = vshlq_s16(tmp1, fixed_point_position_s16); + + // Convert back to qint8 and accumulate + return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1))); +} + +inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position) +{ + const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); + + // Initialize the temporary results with a constant used to round up the result + qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1)); + + // Vector multiply-accumulate long + tmp = vmlal_s8(tmp, b, c); + + // Shift right by fixed_point_position + tmp = vqshlq_s16(tmp, fixed_point_position_s16); + + // Convert back to qint8 and accumulate + return vqadd_s8(a, vqmovn_s16(tmp)); +} + +inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position) +{ + const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); + + // Initialize the temporary results with a constant used to round up the result + qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1)); + qint16x8_t tmp1 = tmp0; + + // Vector multiply-accumulate long + tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c)); + tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c)); + + // Shift right by fixed_point_position + tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16); + tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16); + + // Convert back to qint8 and accumulate + qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1)); + return vqaddq_s8(a, res); +} + +inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position) +{ + const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); + + // Initialize the temporary results with a constant used to round up the result + qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1)); + + // Vector multiply-accumulate long + tmp = vmlal_s8(tmp, b, c); + + // Shift right by fixed_point_position + tmp = 
vshlq_s16(tmp, fixed_point_position_s16); + + // Accumulate + return vaddq_s16(a, tmp); +} + +inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position) +{ + const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); + + // Initialize the temporary results with a constant used to round up the result + qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1)); + + // Vector multiply-accumulate long + tmp = vmlal_s8(tmp, b, c); + + // Shift right by fixed_point_position + tmp = vqshlq_s16(tmp, fixed_point_position_s16); + + // Accumulate + return vqaddq_s16(a, tmp); +} + +inline qint8x8_t vcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position) +{ + const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position)); + + float32x4x2_t res_f32 = + { + { + vdupq_n_f32(0.5f), + vdupq_n_f32(0.5f) + } + }; + + res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2); + res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2); + + const int32x4x2_t res_s32 = + { + { + vcvtq_s32_f32(res_f32.val[0]), + vcvtq_s32_f32(res_f32.val[1]), + } + }; + + const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])); + + return vqmovn_s16(res_s16); +} + +inline qint8x16_t vcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position) +{ + const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position)); + + float32x4x4_t res_f32 = + { + { + vdupq_n_f32(0.5f), + vdupq_n_f32(0.5f), + vdupq_n_f32(0.5f), + vdupq_n_f32(0.5f) + } + }; + + res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2); + res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2); + res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2); + res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2); + + const int32x4x4_t res_s32 = + { + { + vcvtq_s32_f32(res_f32.val[0]), + vcvtq_s32_f32(res_f32.val[1]), + vcvtq_s32_f32(res_f32.val[2]), + vcvtq_s32_f32(res_f32.val[3]), + } + }; + + const int16x8x2_t res_s16 = + { + { + vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])), + vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])), + } + }; + + return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1])); +} + +inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position) +{ + const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position)); + + const int16x8_t res_s16 = vmovl_s8(a); + + const int32x4x2_t res_s32 = + { + { + vmovl_s16(vget_low_s16(res_s16)), + vmovl_s16(vget_high_s16(res_s16)) + } + }; + + float32x4x2_t res_f32 = + { + { + vcvtq_f32_s32(res_s32.val[0]), + vcvtq_f32_s32(res_s32.val[1]) + } + }; + + res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2); + res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2); + + return res_f32; +} + +inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position) +{ + const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position)); + + const int16x8x2_t res_s16 = + { + { + vmovl_s8(vget_low_s8(a)), + vmovl_s8(vget_high_s8(a)), + } + }; + + const int32x4x4_t res_s32 = + { + { + vmovl_s16(vget_low_s16(res_s16.val[0])), + vmovl_s16(vget_high_s16(res_s16.val[0])), + vmovl_s16(vget_low_s16(res_s16.val[1])), + vmovl_s16(vget_high_s16(res_s16.val[1])), + } + }; + + float32x4x4_t res_f32 = + { + { + vcvtq_f32_s32(res_s32.val[0]), + vcvtq_f32_s32(res_s32.val[1]), + vcvtq_f32_s32(res_s32.val[2]), + vcvtq_f32_s32(res_s32.val[3]) + } + }; + + res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2); +
res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2); + res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2); + res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2); + + return res_f32; +} + +inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position) +{ + // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0 + const qint8x8_t const_48_over_17 = vdup_n_s8(0x7A >> (5 - fixed_point_position)); // 2.823 + const qint8x8_t const_minus_32_over_17 = vdup_n_s8(-(0x3C >> (5 - fixed_point_position))); // -1.8823 + const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position); + + // Find shift value + const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)))); + const qint8x8_t temp = vshl_s8(a, shift_value); + + qint8x8_t x = vadd_s8(const_48_over_17, vmul_qs8(temp, const_minus_32_over_17, fixed_point_position)); + + uint8x8_t set_one = vcgt_s8(x, const_one); + x = vbsl_s8(set_one, const_one, x); + + // Use three iterations of Newton-Raphson method to get the result + x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position)); + x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position)); + x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position)); + + return vshl_s8(x, shift_value); +} + +inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position) +{ + // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0 + const qint8x16_t const_48_over_17 = vdupq_n_s8(0x7A >> (5 - fixed_point_position)); // 2.823 + const qint8x16_t const_minus_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // -1.8823 + const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position); + + // Find shift value + const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)))); + const qint8x16_t temp = vshlq_s8(a, shift_value); + + qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_minus_32_over_17, fixed_point_position)); + + // Set initial guess to one if x > 1 + uint8x16_t set_one = vcgtq_s8(x, const_one); + x = vbslq_s8(set_one, const_one, x); + + // Use three iterations of Newton-Raphson method to get the result + x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position)); + x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position)); + x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position)); + + return vshlq_s8(x, shift_value); +} + +inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position) +{ + // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0 + const qint8x16_t const_48_over_17 = vdupq_n_s8(0x7A >> (5 - fixed_point_position)); // 2.823 + const qint8x16_t const_minus_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // -1.8823 + const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position); + + // Find shift value + const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)))); + const qint8x16_t temp = vqshlq_s8(a, shift_value); + + qint8x16_t x = vqsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_minus_32_over_17, fixed_point_position)); + + // Set initial guess to one if 
x > 1 + uint8x16_t set_one = vcgtq_s8(x, const_one); + x = vbslq_s8(set_one, const_one, x); + + // Use three iterations of Newton-Raphson method to get the result + x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position)); + x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position)); + x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position)); + + return vqshlq_s8(x, shift_value); +} + +inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position) +{ + return vmul_qs8(a, vrecip_qs8(b, fixed_point_position), fixed_point_position); +} + +inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position) +{ + return vmulq_qs8(a, vrecipq_qs8(b, fixed_point_position), fixed_point_position); +} + +template <bool islog> +inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position) +{ + const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position)); + const qint8x8_t const_one = vdup_n_s8(1); + const qint8x8_t A = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value); + const qint8x8_t B = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value); + const qint8x8_t C = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value); + const qint8x8_t D = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value); + const qint8x8_t x1 = vadd_s8(vmul_qs8(a, D, fixed_point_position), C); + const qint8x8_t x2 = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B); + const qint8x8_t x3 = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A); + const qint8x8_t res = vmul_qs8(a, x3, fixed_point_position); + return res; +} + +template <bool islog> +inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position) +{ + const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position)); + const qint8x8_t const_one = vdup_n_s8(1); + const qint8x8_t A = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value); + const qint8x8_t B = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value); + const qint8x8_t C = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value); + const qint8x8_t D = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value); + const qint8x8_t x1 = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C); + const qint8x8_t x2 = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B); + const qint8x8_t x3 = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A); + const qint8x8_t res = vqmul_qs8(a, x3, fixed_point_position); + return res; +}
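+/* The polynomial evaluators above and below use Horner's scheme with a zero constant term: x1 = a*D + C, x2 = a*x1 + B, x3 = a*x2 + A, so res = a * (A + a * (B + a * (C + a * D))). A scalar sketch of the same evaluation (illustrative only, not part of the library): + * + * float horner4(float a, float A, float B, float C, float D) + * { + * return a * (A + a * (B + a * (C + a * D))); + * } + */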
+template <bool islog> +inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position) +{ + const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position)); + const qint8x16_t const_one = vdupq_n_s8(1); + const qint8x16_t A = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value); + const qint8x16_t B = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value); + const qint8x16_t C = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value); + const qint8x16_t D = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value); + const qint8x16_t x1 = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C); + const qint8x16_t x2 = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B); + const qint8x16_t x3 = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A); + const qint8x16_t res = vmulq_qs8(a, x3, fixed_point_position); + return res; +} + +template <bool islog> +inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position) +{ + const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position)); + const qint8x16_t const_one = vdupq_n_s8(1); + const qint8x16_t A = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value); + const qint8x16_t B = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value); + const qint8x16_t C = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value); + const qint8x16_t D = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value); + const qint8x16_t x1 = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C); + const qint8x16_t x2 = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B); + const qint8x16_t x3 = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A); + const qint8x16_t res = vqmulq_qs8(a, x3, fixed_point_position); + return res; +} + +inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position) +{ + const qint8x8_t shift_value = vdup_n_s8(fixed_point_position - 7); + const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position); + const qint8x8_t const_ln2 = vqrshl_s8(vdup_n_s8(0x58), shift_value); // ln(2) + const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2) + + // Perform range reduction [-log(2),log(2)] + const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2) + + // Get the decimal part of m + const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position)); + + qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position); + alpha = vqabs_qs8(vqsub_s8(a, alpha)); + + // Polynomial Approximation + qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position); + poly = vqadd_s8(poly, const_one); + + // Reconstruct + poly = vqshl_s8(poly, dec_m); + + return poly; +} + +inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position) +{ + const qint8x16_t shift_value = vdupq_n_s8(fixed_point_position - 7); + const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position); + const qint8x16_t const_ln2 = vqrshlq_s8(vdupq_n_s8(0x58), shift_value); // ln(2) + const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2) + + // Perform range reduction [-log(2),log(2)] + const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2) + + // Get the decimal part of m + const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position)); + + qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position); + alpha = vqabsq_qs8(vqsubq_qs8(a, alpha)); + + // Polynomial Approximation + qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position); + poly = vqaddq_s8(poly, const_one); + + // Reconstruct + poly = vqshlq_s8(poly, dec_m); + + return poly; +}
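+/* vqexp_qs8() and vqexpq_qs8() rely on the classic range reduction e^x = 2^m * e^alpha, where m is the integer part of x / ln(2) and alpha the remaining fraction; the polynomial approximates e^alpha and the final left shift by m reconstructs the 2^m factor. In scalar floating point terms (an illustrative sketch, assuming <cmath>): + * + * float exp_by_range_reduction(float x) + * { + * const int m = static_cast<int>(std::floor(x / 0.6931471805f)); // x / ln(2) + * const float alpha = x - m * 0.6931471805f; // in [0, ln(2)) + * return std::ldexp(std::exp(alpha), m); // e^x = 2^m * e^alpha + * } + */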
+inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position) +{ + const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position); + const qint8x8_t const_seven_dec = vdup_n_s8(7); + const qint8x8_t const_ln2 = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2) + + // If 0 < a < 1, calculate log(1/x) + uint8x8_t calc_reciprocal = vclt_s8(a, const_one); + qint8x8_t recip = vdup_n_s8(0); + recip = vbsl_s8(calc_reciprocal, a, recip); + + // Calculate reciprocal + recip = vrecip_qs8(recip, fixed_point_position); + a = vbsl_s8(calc_reciprocal, recip, a); + + // Get decimal part of a + qint8x8_t shift_value = vdup_n_s8(-fixed_point_position); + qint8x8_t dec_a = vshl_s8(a, shift_value); // a >> fixed_point_position + + // Get exponent of 2^n which is equal to or less than dec_a + shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a)); + + // Get x to range (1, 2] + const qint8x8_t shift_value_neg = vneg_s8(shift_value); + const qint8x8_t temp = vsub_s8(vrshl_s8(a, shift_value_neg), const_one); + const qint8x8_t sum = vmul_s8(shift_value, const_one); + + // Polynomial Approximation + qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position); + + // Reconstruct + poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position); + + // Set negative value for 0 < a < 1 + poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly); + + return poly; +} + +inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position) +{ + const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position); + const qint8x16_t const_seven_dec = vdupq_n_s8(7); + const qint8x16_t const_ln2 = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2) + + // If 0 < a < 1, calculate log(1/x) + uint8x16_t calc_reciprocal = vcltq_s8(a, const_one); + qint8x16_t recip = vdupq_n_s8(0); + recip = vbslq_s8(calc_reciprocal, a, recip); + + // Calculate reciprocal + recip = vrecipq_qs8(recip, fixed_point_position); + a = vbslq_s8(calc_reciprocal, recip, a); + + // Get decimal part of a + qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position); + qint8x16_t dec_a = vshlq_s8(a, shift_value); // a >> fixed_point_position + + // Get exponent of 2^n which is equal to or less than dec_a + shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a)); + + // Get x to range (1, 2] + const qint8x16_t shift_value_neg = vnegq_s8(shift_value); + const qint8x16_t temp = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one); + const qint8x16_t sum = vmulq_s8(shift_value, const_one); + + // Polynomial Approximation + qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position); + + // Reconstruct + poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position); + + // Set negative value for 0 < a < 1 + poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly); + + return poly; +} + +inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position) +{ + const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position); + + // Find shift value. Number must be in (0.5, 2) range.
+ qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)))); + + // Add one when the shift value is negative in order to get the correct result when we shift right with 1 + qint8x8_t temp = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))); + uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0)); + temp = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp); + qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1)); + + temp = vshl_s8(a, shift_value); + + // Initial guess + qint8x8_t x = temp; + + // Calculate (x / 2) * (3 - a * x^2) + // After three iterations we have the result for 8 bit + x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); + x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); + x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); + + return vshl_s8(x, shift_value2); +} + +inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position) +{ + const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position); + + // Find shift value. Number must be in (0.5, 2) range. + qint8x8_t shift_value = vneg_s8(vqsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)))); + + // Add one when the shift value is negative in order to get the correct result when we shift right with 1 + qint8x8_t temp = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))); + uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0)); + temp = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp); + qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1)); + + temp = vshl_s8(a, shift_value); + + // Initial guess + qint8x8_t x = temp; + + // Calculate (x / 2) * (3 - a * x^2) + // After three iterations we have the result for 8 bit + x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); + x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); + x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); + + return vshl_s8(x, shift_value2); +} + +inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position) +{ + const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position); + + // Find shift value. Number must be in (0.5, 2) range. 
+ qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)))); + + // Add one when the shift value is negative in order to get the correct result when we shift right with 1 + qint8x16_t temp = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))); + uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0)); + temp = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp); + qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1)); + + temp = vshlq_s8(a, shift_value); + + // Initial guess + qint8x16_t x = temp; + + // Calculate (x / 2) * (3 - a * x^2) + // After three iterations we have the result for 8 bit + x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); + x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); + x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); + + return vshlq_s8(x, shift_value2); +} + +inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position) +{ + const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position); + + // Find shift value. Number must be in (0.5, 2) range. + qint8x16_t shift_value = vnegq_s8(vqsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)))); + + // Add one when the shift value is negative in order to get the correct result when we shift right with 1 + qint8x16_t temp = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))); + uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0)); + temp = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp); + qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1)); + + temp = vshlq_s8(a, shift_value); + + // Initial guess + qint8x16_t x = temp; + + // Calculate (x / 2) * (3 - a * x^2) + // After three iterations we have the result for 8 bit + x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); + x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); + x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); + + return vshlq_s8(x, shift_value2); +} + +inline qint8x8_t vtanh_qs8(qint8x8_t a, int fixed_point_position) +{ + const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position); + const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position); + + qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position); + qint8x8_t num = vqsub_qs8(exp2x, const_one); + qint8x8_t den = vqadd_qs8(exp2x, const_one); + qint8x8_t tanh = vqmul_qs8(num, vrecip_qs8(den, fixed_point_position), fixed_point_position); + + return tanh; +} + +inline qint8x16_t vtanhq_qs8(qint8x16_t a, int fixed_point_position) +{ + const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position); + const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position); + + qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position); + qint8x16_t num = vqsubq_qs8(exp2x, const_one); + qint8x16_t den = 
vqaddq_qs8(exp2x, const_one); + qint8x16_t tanh = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position); + + return tanh; +} + +inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position) +{ + return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position); +} +} diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h new file mode 100644 index 0000000000..eaa50f123b --- /dev/null +++ b/arm_compute/core/NEON/NEKernels.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEKERNELS_H__ +#define __ARM_COMPUTE_NEKERNELS_H__ + +/* Header regrouping all the NEON kernels */ +#include "arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h" +#include "arm_compute/core/NEON/kernels/NEAccumulateKernel.h" +#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h" +#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h" +#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h" +#include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h" +#include "arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h" +#include "arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h" +#include "arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h" +#include "arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h" +#include "arm_compute/core/NEON/kernels/NEBox3x3Kernel.h" +#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h" +#include "arm_compute/core/NEON/kernels/NEChannelCombineKernel.h" +#include "arm_compute/core/NEON/kernels/NEChannelExtractKernel.h" +#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h" +#include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h" +#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h" +#include "arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h" +#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h" +#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h" +#include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h" +#include "arm_compute/core/NEON/kernels/NEDilateKernel.h" +#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h" +#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h" +#include "arm_compute/core/NEON/kernels/NEErodeKernel.h" +#include 
"arm_compute/core/NEON/kernels/NEFastCornersKernel.h" +#include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h" +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" +#include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h" +#include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h" +#include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h" +#include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h" +#include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h" +#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h" +#include "arm_compute/core/NEON/kernels/NEHistogramKernel.h" +#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h" +#include "arm_compute/core/NEON/kernels/NEIntegralImageKernel.h" +#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h" +#include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h" +#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h" +#include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h" +#include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h" +#include "arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h" +#include "arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h" +#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h" +#include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h" +#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h" +#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h" +#include "arm_compute/core/NEON/kernels/NERemapKernel.h" +#include "arm_compute/core/NEON/kernels/NEScaleKernel.h" +#include "arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h" +#include "arm_compute/core/NEON/kernels/NESobel3x3Kernel.h" +#include "arm_compute/core/NEON/kernels/NESobel5x5Kernel.h" +#include "arm_compute/core/NEON/kernels/NESobel7x7Kernel.h" +#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h" +#include "arm_compute/core/NEON/kernels/NETableLookupKernel.h" +#include "arm_compute/core/NEON/kernels/NEThresholdKernel.h" +#include "arm_compute/core/NEON/kernels/NETransposeKernel.h" +#include "arm_compute/core/NEON/kernels/NEWarpKernel.h" +#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h" + +#endif /* __ARM_COMPUTE_NEKERNELS_H__ */ diff --git a/arm_compute/core/NEON/NEMath.h b/arm_compute/core/NEON/NEMath.h new file mode 100644 index 0000000000..bb8a330c1e --- /dev/null +++ b/arm_compute/core/NEON/NEMath.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEMATH_H__ +#define __ARM_COMPUTE_NEMATH_H__ + +#include <arm_neon.h> +#include <array> + +namespace arm_compute +{ +/** Calculate inverse square root. + * + * @param[in] x Input value. + * + * @return The calculated inverse square root. + */ +float32x4_t vinvsqrtq_f32(float32x4_t x); + +/** Calculate reciprocal. + * + * @param[in] x Input value. + * + * @return The calculated reciprocal. + */ +float32x4_t vinvq_f32(float32x4_t x); + +/** Perform a 7th degree polynomial approximation using Estrin's method. + * + * @param[in] x Input vector value in F32 format. + * @param[in] coeffs Polynomial coefficients table. + * + * @return The calculated approximation. + */ +float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array<float32x4_t, 8> &coeffs); + +/** Calculate the exponential. + * + * @param[in] x Input vector value in F32 format. + * + * @return The calculated exponential. + */ +float32x4_t vexpq_f32(float32x4_t x); + +/** Calculate the natural logarithm. + * + * @param[in] x Input vector value in F32 format. + * + * @return The calculated logarithm. + */ +float32x4_t vlogq_f32(float32x4_t x); + +/** Calculate hyperbolic tangent. + * + * tanh(x) = (e^2x - 1)/(e^2x + 1) + * + * @note We clamp x to [-10,10] to avoid overflow issues; this matches the bounds used in the implementation. + * + * @param[in] val Input vector value in F32 format. + * + * @return The calculated hyperbolic tangent. + */ +float32x4_t vtanhq_f32(float32x4_t val); + +/** Calculate the n-th power of a number. + * + * pow(x,n) = e^(n*log(x)) + * + * @param[in] val Input vector value in F32 format. + * @param[in] n Power to raise the input to. + * + * @return The calculated power. + */ +float32x4_t vpowq_f32(float32x4_t val, float32x4_t n); +} +#include "arm_compute/core/NEON/NEMath.inl" +#endif /* __ARM_COMPUTE_NEMATH_H__ */
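[Editor's note] The declarations above are implemented in NEMath.inl, reproduced next. As a quick orientation, a minimal usage sketch (illustrative only, not part of the patch; it assumes the arm_compute headers are on the include path):

    #include <arm_neon.h>
    #include "arm_compute/core/NEON/NEMath.h"

    float32x4_t demo()
    {
        const float32x4_t x = vdupq_n_f32(0.5f);
        const float32x4_t e = arm_compute::vexpq_f32(x);  // each lane ~1.6487 (= e^0.5)
        const float32x4_t t = arm_compute::vtanhq_f32(x); // each lane ~0.4621 (= tanh(0.5))
        return vmulq_f32(e, t);                           // lane-wise product of the two results
    }

diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl new file mode 100644 index 0000000000..a31a4c0dc5 --- /dev/null +++ b/arm_compute/core/NEON/NEMath.inl @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited.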
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +namespace arm_compute +{ +/* Exponent polynomial coefficients */ +const std::array<float32x4_t, 8> exp_tab = +{ + { + vdupq_n_f32(1.f), + vdupq_n_f32(0.0416598916054f), + vdupq_n_f32(0.500000596046f), + vdupq_n_f32(0.0014122662833f), + vdupq_n_f32(1.00000011921f), + vdupq_n_f32(0.00833693705499f), + vdupq_n_f32(0.166665703058f), + vdupq_n_f32(0.000195780929062f), + } +}; + +/* Logarithm polynomial coefficients */ +const std::array<float32x4_t, 8> log_tab = +{ + { + vdupq_n_f32(-2.29561495781f), + vdupq_n_f32(-2.47071170807f), + vdupq_n_f32(-5.68692588806f), + vdupq_n_f32(-0.165253549814f), + vdupq_n_f32(5.17591238022f), + vdupq_n_f32(0.844007015228f), + vdupq_n_f32(4.58445882797f), + vdupq_n_f32(0.0141278216615f), + } +}; + +inline float32x4_t vinvsqrtq_f32(float32x4_t x) +{ + float32x4_t sqrt_reciprocal = vrsqrteq_f32(x); + sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); + sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal); + + return sqrt_reciprocal; +} + +inline float32x4_t vinvq_f32(float32x4_t x) +{ + float32x4_t recip = vrecpeq_f32(x); + recip = vmulq_f32(vrecpsq_f32(x, recip), recip); + recip = vmulq_f32(vrecpsq_f32(x, recip), recip); + return recip; +} + +inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array<float32x4_t, 8> &coeffs) +{ + float32x4_t A = vmlaq_f32(coeffs[0], coeffs[4], x); + float32x4_t B = vmlaq_f32(coeffs[2], coeffs[6], x); + float32x4_t C = vmlaq_f32(coeffs[1], coeffs[5], x); + float32x4_t D = vmlaq_f32(coeffs[3], coeffs[7], x); + float32x4_t x2 = vmulq_f32(x, x); + float32x4_t x4 = vmulq_f32(x2, x2); + float32x4_t res = vmlaq_f32(vmlaq_f32(A, B, x2), vmlaq_f32(C, D, x2), x4); + return res; +} + +inline float32x4_t vexpq_f32(float32x4_t x) +{ + static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2) + static const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); // 1/ln(2) + + // Perform range reduction [-log(2),log(2)] + int32x4_t m = vcvtq_s32_f32(vmulq_f32(x, CONST_INV_LN2)); + float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), CONST_LN2); + + // Polynomial Approximation + float32x4_t poly = vtaylor_polyq_f32(val, exp_tab); + + // Reconstruct + poly = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(poly), vshlq_n_s32(m, 23))); + + return poly; +}
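[Editor's note] The "Reconstruct" step above exploits IEEE-754 layout: after the range reduction x = m*ln(2) + r, adding m to the 8-bit exponent field multiplies the value by 2^m, giving exp(x) = 2^m * exp(r). A scalar sketch of the same step (illustrative only, not part of the patch):

    #include <cstdint>
    #include <cstring>

    // exp(x) = 2^m * exp(r): scale poly (~exp(r)) by 2^m via the exponent bits.
    float reconstruct(float poly, int32_t m)
    {
        int32_t bits;
        std::memcpy(&bits, &poly, sizeof(bits)); // well-defined type punning
        bits += m << 23;                         // bit 23 is the lowest exponent bit
        std::memcpy(&poly, &bits, sizeof(poly));
        return poly;
    }

 +inline float32x4_t vlogq_f32(float32x4_t x) +{ + static const int32x4_t CONST_127 =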
vdupq_n_s32(127); // 127 + static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2) + + // Extract exponent + int32x4_t m = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127); + float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23))); + + // Polynomial Approximation + float32x4_t poly = vtaylor_polyq_f32(val, log_tab); + + // Reconstruct + poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2); + + return poly; +} + +inline float32x4_t vtanhq_f32(float32x4_t val) +{ + static const float32x4_t CONST_1 = vdupq_n_f32(1.f); + static const float32x4_t CONST_2 = vdupq_n_f32(2.f); + static const float32x4_t CONST_MIN_TANH = vdupq_n_f32(-10.f); + static const float32x4_t CONST_MAX_TANH = vdupq_n_f32(10.f); + + float32x4_t x = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH); + float32x4_t exp2x = vexpq_f32(vmulq_f32(CONST_2, x)); + float32x4_t num = vsubq_f32(exp2x, CONST_1); + float32x4_t den = vaddq_f32(exp2x, CONST_1); + float32x4_t tanh = vmulq_f32(num, vinvq_f32(den)); + return tanh; +} + +inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n) +{ + return vexpq_f32(vmulq_f32(n, vlogq_f32(val))); +} +} \ No newline at end of file
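[Editor's note] Because vpowq_f32 is computed as e^(n*log(x)) and vlogq_f32 reads the exponent bits directly, lanes with a non-positive base yield meaningless results; callers must ensure x > 0. A usage sketch (illustrative only, not part of the patch):

    #include <arm_neon.h>
    #include "arm_compute/core/NEON/NEMath.h"

    float32x4_t cube_demo()
    {
        // 2^3 per lane; well-defined because the base is strictly positive
        return arm_compute::vpowq_f32(vdupq_n_f32(2.f), vdupq_n_f32(3.f)); // each lane ~8.0
    }

diff --git a/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h b/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h new file mode 100644 index 0000000000..9ef93ce67a --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.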
+ */ +#ifndef __ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H__ +#define __ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the absolute difference kernel + * + * Absolute difference is computed by: + * @f[ output(x,y) = | input1(x,y) - input2(x,y) | @f] + */ +class NEAbsoluteDifferenceKernel : public INEKernel +{ +public: + /** Default constructor */ + NEAbsoluteDifferenceKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEAbsoluteDifferenceKernel(const NEAbsoluteDifferenceKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEAbsoluteDifferenceKernel &operator=(const NEAbsoluteDifferenceKernel &) = delete; + /** Allow instances of this class to be moved */ + NEAbsoluteDifferenceKernel(NEAbsoluteDifferenceKernel &&) = default; + /** Allow instances of this class to be moved */ + NEAbsoluteDifferenceKernel &operator=(NEAbsoluteDifferenceKernel &&) = default; + /** Default destructor */ + ~NEAbsoluteDifferenceKernel() = default; + + /** Set the inputs and output tensors + * + * @param[in] input1 Source tensor. Data types supported: U8/S16 + * @param[in] input2 Source tensor. Data types supported: U8/S16 + * @param[out] output Destination tensor. Data types supported: U8/S16 + */ + void configure(const ITensor *input1, const ITensor *input2, ITensor *output); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + /** Common signature for all the specialised absolute difference functions + * + * @param[in] input1 An input tensor. Data types supported: U8/S16. + * @param[in] input2 An input tensor. Data types supported: U8/S16. + * @param[out] output The output tensor. Data types supported: U8 (only if both inputs are U8), S16. + * @param[in] window Region on which to execute the kernel. + */ + using AbsDiffFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window); + + /** Absolute difference function to use for the particular tensor formats passed to configure() */ + AbsDiffFunction *_func; + const ITensor *_input1; + const ITensor *_input2; + ITensor *_output; +}; +} +#endif /* __ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H__ */
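[Editor's note] Each kernel in this patch follows the same two-step contract: configure() validates the tensors and selects the specialised function, then the runtime schedules run() over a window. A hedged usage sketch (illustrative only; NEScheduler belongs to the library's runtime, not to this patch, and tensor allocation is elided):

    #include "arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"

    void run_absdiff(arm_compute::ITensor &in1, arm_compute::ITensor &in2, arm_compute::ITensor &out)
    {
        arm_compute::NEAbsoluteDifferenceKernel kernel;
        kernel.configure(&in1, &in2, &out); // picks the U8/S16 specialisation
        // Split the execution window across worker threads along Y
        arm_compute::NEScheduler::get().schedule(&kernel, arm_compute::Window::DimY);
    }

diff --git a/arm_compute/core/NEON/kernels/NEAccumulateKernel.h b/arm_compute/core/NEON/kernels/NEAccumulateKernel.h new file mode 100644 index 0000000000..df6d7b8891 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEAccumulateKernel.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.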
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEACCUMULATEKERNEL_H__ +#define __ARM_COMPUTE_NEACCUMULATEKERNEL_H__ + +#include "arm_compute/core/NEON/INESimpleKernel.h" + +#include <cstdint> + +namespace arm_compute +{ +class ITensor; + +/** Interface for the accumulate kernel + * + * Accumulation is computed by: + * @f[ accum(x,y) = accum(x,y) + input(x,y) @f] + */ +class NEAccumulateKernel : public INESimpleKernel +{ +public: + /** Set the input and accumulation tensors + * + * @param[in] input Source tensor. Data type supported: U8. + * @param[out] accum Destination tensor. Data type supported: S16. + */ + void configure(const ITensor *input, ITensor *accum); + + // Inherited methods overridden: + void run(const Window &window) override; +}; + +/** Interface for the accumulate weighted kernel + * + * Weighted accumulation is computed: + * @f[ accum(x,y) = (1 - \alpha)*accum(x,y) + \alpha*input(x,y) @f] + * + * Where @f$ 0 \le \alpha \le 1 @f$ + * Conceptually, the rounding for this is defined as: + * @f[ output(x,y)= uint8( (1 - \alpha) * float32( int32( output(x,y) ) ) + \alpha * float32( int32( input(x,y) ) ) ) @f] +*/ +class NEAccumulateWeightedKernel : public INESimpleKernel +{ +public: + /** Default constructor */ + NEAccumulateWeightedKernel(); + /** Set the input and accumulation tensors, and the scale value + * + * @param[in] input Source tensor. Data type supported: U8. + * @param[in] alpha Scalar value in the range [0.0f, 1.0f] + * @param[in,out] accum Accumulated tensor. Data type supported: U8. + */ + void configure(const ITensor *input, float alpha, ITensor *accum); + + // Inherited methods overridden: + void run(const Window &window) override; + +protected: + float _alpha; +}; + +#ifdef ARM_COMPUTE_ENABLE_FP16 +/** Interface for the accumulate weighted kernel using F16 */ +class NEAccumulateWeightedFP16Kernel : public NEAccumulateWeightedKernel +{ +public: + // Inherited methods overridden: + void run(const Window &window) override; +}; +#else +using NEAccumulateWeightedFP16Kernel = NEAccumulateWeightedKernel; +#endif + +/** Interface for the accumulate squared kernel + * + * The accumulation of squares is computed: + * @f[ accum(x,y) = saturate_{int16} ( (uint16) accum(x,y) + (((uint16)(input(x,y)^2)) >> (shift)) ) @f] + * + * Where @f$ 0 \le shift \le 15 @f$ +*/ +class NEAccumulateSquaredKernel : public INESimpleKernel +{ +public: + /** Default constructor */ + NEAccumulateSquaredKernel(); + /** Set the input and accumulation tensors and the shift value. + * + * @param[in] input Source tensor. Data type supported: U8. + * @param[in] shift Shift value in the range [0, 15] + * @param[in,out] accum Accumulated tensor. Data type supported: S16. + */ + void configure(const ITensor *input, uint32_t shift, ITensor *accum); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + uint32_t _shift; +}; +} +#endif /*__ARM_COMPUTE_NEACCUMULATEKERNEL_H__ */
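[Editor's note] The conceptual rounding in the weighted-accumulation comment above is easier to read in scalar form; an equivalent per-pixel update (illustrative only, not part of the patch):

    #include <cstdint>

    // accum = (1 - alpha) * accum + alpha * input, computed in float and
    // truncated back to U8, mirroring the definition in the class comment.
    uint8_t accumulate_weighted(uint8_t accum, uint8_t input, float alpha)
    {
        const float blended = (1.f - alpha) * static_cast<float>(accum) + alpha * static_cast<float>(input);
        return static_cast<uint8_t>(blended);
    }

diff --git a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h new file mode 100644 index 0000000000..97f92d6a1e --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2017 ARM Limited.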
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H__ +#define __ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H__ + +#include "arm_compute/core/FixedPoint.h" +#include "arm_compute/core/NEON/INESimpleKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the activation layer kernel. */ +class NEActivationLayerKernel : public INESimpleKernel +{ +public: + /** Constructor */ + NEActivationLayerKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEActivationLayerKernel(const NEActivationLayerKernel &) = delete; + /** Default move constructor */ + NEActivationLayerKernel(NEActivationLayerKernel &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEActivationLayerKernel &operator=(const NEActivationLayerKernel &) = delete; + /** Default move assignment operator */ + NEActivationLayerKernel &operator=(NEActivationLayerKernel &&) = default; + /** Set the input and output tensor. + * + * @param[in] input Source tensor. Data types supported: QS8/F32. + * @param[out] output Destination tensor. Data type supported: same as @p input + * @param[in] activation_info Activation layer information. + */ + void configure(const ITensor *input, ITensor *output, ActivationLayerInfo activation_info); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + using ActivationFunction = ActivationLayerInfo::ActivationFunction; + /** Common signature for all the specialised @ref NEActivationLayerKernel functions + * + * @param[in] window Region on which to execute the kernel. + */ + using ActivationFunctionExecutorPtr = void (NEActivationLayerKernel::*)(const Window &window); + /** Function to apply an activation function on a tensor. + * + * @param[in] window Region on which to execute the kernel + */ + template <ActivationFunction F, typename T> + typename std::enable_if<std::is_same<T, float>::value, void>::type activation(const Window &window); + /** Function to apply an activation function on a tensor. + * + * @param[in] window Region on which to execute the kernel + */ + template <ActivationFunction F, typename T> + typename std::enable_if<std::is_same<T, qint8_t>::value, void>::type activation(const Window &window); + +private: + ActivationFunctionExecutorPtr _func; + ActivationLayerInfo _act_info; +}; +} +#endif /*__ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h new file mode 100644 index 0000000000..b36ca46e1a --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H__ +#define __ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the kernel to perform addition between two tensors */ +class NEArithmeticAdditionKernel : public INEKernel +{ +public: + /** Default constructor */ + NEArithmeticAdditionKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEArithmeticAdditionKernel(const NEArithmeticAdditionKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEArithmeticAdditionKernel &operator=(const NEArithmeticAdditionKernel &) = delete; + /** Allow instances of this class to be moved */ + NEArithmeticAdditionKernel(NEArithmeticAdditionKernel &&) = default; + /** Allow instances of this class to be moved */ + NEArithmeticAdditionKernel &operator=(NEArithmeticAdditionKernel &&) = default; + /** Default destructor */ + ~NEArithmeticAdditionKernel() = default; + + /** Initialise the kernel's inputs, output and overflow policy. + * + * @param[in] input1 An input tensor. Data types supported: U8/S16/F32 + * @param[in] input2 An input tensor. Data types supported: U8/S16/F32 (only if @p input1 is F32). + * @param[out] output The output tensor. Data types supported: U8 (only if both inputs are U8), S16/F32 (only if both inputs are F32). + * @param[in] policy Overflow policy.
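+ * + * @note With ConvertPolicy::SATURATE a U8 addition clamps to the type range (e.g. 200 + 100 gives 255), while ConvertPolicy::WRAP wraps modulo 256 (200 + 100 gives 44).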
+ */ + void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + /** Common signature for all the specialised add functions + * + * @param[in] input1 An input tensor. Data types supported: U8/S16/F32. + * @param[in] input2 An input tensor. Data types supported: U8/S16/F32 (only if @p input1 is F32). + * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F32 (only if both inputs are F32). + * @param[in] window Region on which to execute the kernel. + */ + using AddFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window); + /** Add function to use for the particular tensor types passed to configure() */ + AddFunction *_func; + const ITensor *_input1; + const ITensor *_input2; + ITensor *_output; +}; +} +#endif /*__ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h new file mode 100644 index 0000000000..0eb9c23686 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H__ +#define __ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the kernel to perform subtraction between two tensors */ +class NEArithmeticSubtractionKernel : public INEKernel +{ +public: + /** Default constructor */ + NEArithmeticSubtractionKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEArithmeticSubtractionKernel(const NEArithmeticSubtractionKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEArithmeticSubtractionKernel &operator=(const NEArithmeticSubtractionKernel &) = delete; + /** Allow instances of this class to be moved */ + NEArithmeticSubtractionKernel(NEArithmeticSubtractionKernel &&) = default; + /** Allow instances of this class to be moved */ + NEArithmeticSubtractionKernel &operator=(NEArithmeticSubtractionKernel &&) = default; + /** Default destructor */ + ~NEArithmeticSubtractionKernel() = default; + + /** Initialise the kernel's inputs, output and overflow policy. + * + * @param[in] input1 An input tensor. Data types supported: U8/S16/F32 + * @param[in] input2 An input tensor. Data types supported: U8/S16/F32 (only if @p input1 is F32). + * @param[out] output The output tensor. Data types supported: U8 (only if both inputs are U8), S16/F32 (only if both inputs are F32). + * @param[in] policy Overflow policy. + */ + void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + /** Common signature for all the specialised sub functions + * + * @param[in] input1 An input tensor. Data types supported: U8/S16/F32. + * @param[in] input2 An input tensor. Data types supported: U8/S16/F32 (only if @p input1 is F32). + * @param[out] output The output tensor. Data types supported: U8 (only if both inputs are U8), S16/F32 (only if both inputs are F32). + * @param[in] window Region on which to execute the kernel. + */ + using SubFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window); + /** Sub function to use for the particular tensor types passed to configure() */ + SubFunction *_func; + const ITensor *_input1; + const ITensor *_input2; + ITensor *_output; +}; +} +#endif /* __ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h new file mode 100644 index 0000000000..29fcbd26a0 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H__ +#define __ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the batch normalization layer kernel. + * + * The result is computed per feature map as: + * @f[ output(x,y) = \gamma \cdot \frac{input(x,y) - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta @f] + * + * where @f$ \mu @f$ and @f$ \sigma^2 @f$ are the mean and variance of the feature map. + */ +class NEBatchNormalizationLayerKernel : public INEKernel +{ +public: + /** Default constructor */ + NEBatchNormalizationLayerKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEBatchNormalizationLayerKernel(const NEBatchNormalizationLayerKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEBatchNormalizationLayerKernel &operator=(const NEBatchNormalizationLayerKernel &) = delete; + /** Default Move Constructor. */ + NEBatchNormalizationLayerKernel(NEBatchNormalizationLayerKernel &&) = default; + /** Default move assignment operator. */ + NEBatchNormalizationLayerKernel &operator=(NEBatchNormalizationLayerKernel &&) = default; + /** Default destructor */ + ~NEBatchNormalizationLayerKernel() = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, FM]. + * The rest are optional and used for representing batches. Data types supported: QS8/F32. + * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input + * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] beta Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] epsilon Small value to avoid division with zero. + */ + void configure(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + using BatchNormFunction = void(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window); + BatchNormFunction *_func; + const ITensor *_input; + ITensor *_output; + const ITensor *_mean; + const ITensor *_var; + const ITensor *_gamma; + const ITensor *_beta; + float _epsilon; +}; +} +#endif /*__ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h new file mode 100644 index 0000000000..b931445419 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEBITWISEANDKERNEL_H__ +#define __ARM_COMPUTE_NEBITWISEANDKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the kernel to perform bitwise AND between XY-planes of two tensors + * + * Result is computed by: + * @f[ output(x,y) = input1(x,y) \land input2(x,y) @f] + */ +class NEBitwiseAndKernel : public INEKernel +{ +public: + /** Default constructor */ + NEBitwiseAndKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEBitwiseAndKernel(const NEBitwiseAndKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEBitwiseAndKernel &operator=(const NEBitwiseAndKernel &) = delete; + /** Allow instances of this class to be moved */ + NEBitwiseAndKernel(NEBitwiseAndKernel &&) = default; + /** Allow instances of this class to be moved */ + NEBitwiseAndKernel &operator=(NEBitwiseAndKernel &&) = default; + /** Initialise the kernel's inputs and output + * + * @param[in] input1 An input tensor. Data type supported: U8. + * @param[in] input2 An input tensor. Data type supported: U8. + * @param[out] output Output tensor. Data type supported: U8.
+ */ + void configure(const ITensor *input1, const ITensor *input2, ITensor *output); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + const ITensor *_input1; /**< Source tensor 1 */ + const ITensor *_input2; /**< Source tensor 2 */ + ITensor *_output; /**< Destination tensor */ +}; +} +#endif /* __ARM_COMPUTE_NEBITWISEANDKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h new file mode 100644 index 0000000000..e34eb0f5ae --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEBITWISENOTKERNEL_H__ +#define __ARM_COMPUTE_NEBITWISENOTKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the kernel to perform bitwise NOT operation + * + * Result is computed by: + * @f[ output(x,y) = \lnot input(x,y) @f] + */ +class NEBitwiseNotKernel : public INEKernel +{ +public: + /** Default constructor */ + NEBitwiseNotKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEBitwiseNotKernel(const NEBitwiseNotKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEBitwiseNotKernel &operator=(const NEBitwiseNotKernel &) = delete; + /** Allow instances of this class to be moved */ + NEBitwiseNotKernel(NEBitwiseNotKernel &&) = default; + /** Allow instances of this class to be moved */ + NEBitwiseNotKernel &operator=(NEBitwiseNotKernel &&) = default; + /** Initialise the kernel's input and output + * + * @param[in] input An input tensor. Data type supported: U8. + * @param[out] output The output tensor. Data type supported: U8. 
+ */ + void configure(const ITensor *input, ITensor *output); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + const ITensor *_input; /**< Source tensor */ + ITensor *_output; /**< Destination tensor */ +}; +} +#endif /* __ARM_COMPUTE_NEBITWISENOTKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h new file mode 100644 index 0000000000..d2bae2660c --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEBITWISEORKERNEL_H__ +#define __ARM_COMPUTE_NEBITWISEORKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the kernel to perform bitwise inclusive OR between two tensors + * + * Result is computed by: + * @f[ output(x,y) = input1(x,y) \lor input2(x,y) @f] + */ +class NEBitwiseOrKernel : public INEKernel +{ +public: + /** Default constructor */ + NEBitwiseOrKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEBitwiseOrKernel(const NEBitwiseOrKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEBitwiseOrKernel &operator=(const NEBitwiseOrKernel &) = delete; + /** Allow instances of this class to be moved */ + NEBitwiseOrKernel(NEBitwiseOrKernel &&) = default; + /** Allow instances of this class to be moved */ + NEBitwiseOrKernel &operator=(NEBitwiseOrKernel &&) = default; + /** Initialise the kernel's inputs and output. + * + * @param[in] input1 An input tensor. Data type supported: U8. + * @param[in] input2 An input tensor. Data type supported: U8 + * @param[out] output Output tensor. Data type supported: U8. 
+ */ + void configure(const ITensor *input1, const ITensor *input2, ITensor *output); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + const ITensor *_input1; /**< Source tensor 1 */ + const ITensor *_input2; /**< Source tensor 2 */ + ITensor *_output; /**< Destination tensor */ +}; +} +#endif /* __ARM_COMPUTE_NEBITWISEORKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h new file mode 100644 index 0000000000..9dea36e7e3 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEBITWISEXORKERNEL_H__ +#define __ARM_COMPUTE_NEBITWISEXORKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the kernel to perform bitwise exclusive OR (XOR) between two tensors + * + * Result is computed by: + * @f[ output(x,y) = input1(x,y) \oplus input2(x,y) @f] + */ +class NEBitwiseXorKernel : public INEKernel +{ +public: + /** Default constructor */ + NEBitwiseXorKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEBitwiseXorKernel(const NEBitwiseXorKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEBitwiseXorKernel &operator=(const NEBitwiseXorKernel &) = delete; + /** Allow instances of this class to be moved */ + NEBitwiseXorKernel(NEBitwiseXorKernel &&) = default; + /** Allow instances of this class to be moved */ + NEBitwiseXorKernel &operator=(NEBitwiseXorKernel &&) = default; + /** Initialise the kernel's inputs and output. + * + * @param[in] input1 An input tensor. Data type supported: U8. + * @param[in] input2 An input tensor. Data type supported: U8. + * @param[out] output The output tensor. Data type supported: U8.
+ */ + void configure(const ITensor *input1, const ITensor *input2, ITensor *output); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + const ITensor *_input1; /**< Source tensor 1 */ + const ITensor *_input2; /**< Source tensor 2 */ + ITensor *_output; /**< Destination tensor */ +}; +} +#endif /* __ARM_COMPUTE_NEBITWISEXORKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h b/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h new file mode 100644 index 0000000000..6b7bebbf17 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEBOX3x3KERNEL_H__ +#define __ARM_COMPUTE_NEBOX3x3KERNEL_H__ + +#include "arm_compute/core/NEON/INESimpleKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to perform a Box 3x3 filter */ +class NEBox3x3Kernel : public INESimpleKernel +{ +public: + /** Set the source, destination and border mode of the kernel + * + * @param[in] input Source tensor. Data type supported: U8. + * @param[out] output Destination tensor. Data type supported: U8. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ITensor *input, ITensor *output, bool border_undefined); + // Inherited methods overridden: + void run(const Window &window) override; + BorderSize border_size() const override; +}; + +#ifdef ARM_COMPUTE_ENABLE_FP16 +/** NEON kernel to perform a Box 3x3 filter using F16 simd + */ +class NEBox3x3FP16Kernel : public NEBox3x3Kernel +{ +public: + // Inherited methods overridden: + void run(const Window &window) override; +}; +#else +using NEBox3x3FP16Kernel = NEBox3x3Kernel; +#endif +} +#endif /*__ARM_COMPUTE_NEBOX3x3KERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h b/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h new file mode 100644 index 0000000000..b86085f439 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NECANNYEDGEKERNEL_H__ +#define __ARM_COMPUTE_NECANNYEDGEKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +#include <cstdint> + +namespace arm_compute +{ +class ITensor; + +/** Computes magnitude and quantised phase from input gradients. */ +class NEGradientKernel : public INEKernel +{ +public: + /** Default constructor */ + NEGradientKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGradientKernel(const NEGradientKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGradientKernel &operator=(const NEGradientKernel &) = delete; + /** Allow instances of this class to be moved */ + NEGradientKernel(NEGradientKernel &&) = default; + /** Allow instances of this class to be moved */ + NEGradientKernel &operator=(NEGradientKernel &&) = default; + /** Default destructor */ + virtual ~NEGradientKernel() = default; + + /** Initialise the kernel's sources, destinations and border mode. + * + * @note gx, gy and magnitude must all use the same data size (either 16-bit or 32-bit) + * + * @param[in] gx Source tensor - Gx component. Data type supported: S16/S32. + * @param[in] gy Source tensor - Gy component. Data type supported: same as @p gx. + * @param[out] magnitude Destination tensor - Magnitude. Data type supported: U16 (if the data type of @p gx is S16) / U32 (if the data type of @p gx is S32). + * @param[out] phase Destination tensor - Quantized phase. Data type supported: U8. + * @param[in] norm_type Normalization type. If 1, the L1 norm is used (|gx| + |gy|), otherwise the L2 norm (sqrt(gx^2 + gy^2)). + */ + virtual void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type); + + // Inherited methods overridden: + void run(const Window &window) override; + +protected: + /** Common signature for all the specialised gradient functions + * + * @param[in] gx_ptr Pointer to the first input tensor. + * @param[in] gy_ptr Pointer to the second input tensor. + * @param[out] magnitude_ptr Pointer to the first output tensor + * @param[out] phase_ptr Pointer to the second output tensor + */ + using GradientFunction = void(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr); + + GradientFunction *_func; /**< Gradient function to use for the particular tensor types passed to configure() */ + const ITensor *_gx; /**< Source tensor - Gx component */ + const ITensor *_gy; /**< Source tensor - Gy component */ + ITensor *_magnitude; /**< Destination tensor - Magnitude */ + ITensor *_phase; /**< Destination tensor - Quantized phase */ +}; + +#ifdef ARM_COMPUTE_ENABLE_FP16 +/** NEON kernel to perform Gradient computation + */ +class NEGradientFP16Kernel : public NEGradientKernel +{ +public: + // Inherited methods overridden: + void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type) override; +}; +#else /* ARM_COMPUTE_ENABLE_FP16 */ +using NEGradientFP16Kernel = NEGradientKernel; +#endif /* ARM_COMPUTE_ENABLE_FP16 */ + +/** NEON kernel to perform Non-Maxima suppression for Canny Edge. + * + * @note This kernel is meant to be used alongside CannyEdge and performs a non-maxima suppression using magnitude and phase of input + * to characterize points as possible edges. Thus, at the end, each point will be set to EDGE, NO_EDGE or MAYBE. + * + * @note Hysteresis is computed in @ref NEEdgeTraceKernel + */ +class NEEdgeNonMaxSuppressionKernel : public INEKernel +{ +public: + /** Default constructor */ + NEEdgeNonMaxSuppressionKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEEdgeNonMaxSuppressionKernel(const NEEdgeNonMaxSuppressionKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEEdgeNonMaxSuppressionKernel &operator=(const NEEdgeNonMaxSuppressionKernel &) = delete; + /** Allow instances of this class to be moved */ + NEEdgeNonMaxSuppressionKernel(NEEdgeNonMaxSuppressionKernel &&) = default; + /** Allow instances of this class to be moved */ + NEEdgeNonMaxSuppressionKernel &operator=(NEEdgeNonMaxSuppressionKernel &&) = default; + /** Default destructor */ + ~NEEdgeNonMaxSuppressionKernel() = default; + + /** Initialise the kernel's sources, destination and border mode. + * + * @param[in] magnitude Source tensor - Magnitude. Data type supported: U16/U32. + * @param[in] phase Source tensor - Quantized phase. Data type supported: U8. + * @param[out] output Output tensor. Data type supported: U8. It will be filled with 0 for "no edge", 127 for "maybe", 255 for "edge" + * @param[in] upper_thr Upper threshold used for the hysteresis + * @param[in] lower_thr Lower threshold used for the hysteresis + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ITensor *magnitude, const ITensor *phase, ITensor *output, int32_t upper_thr, int32_t lower_thr, bool border_undefined); + + // Inherited methods overridden: + void run(const Window &window) override; + BorderSize border_size() const override; + +private: + /** Common signature for all the specialised non-maxima suppression functions + * + * @param[in] magnitude_ptr Pointer to the first input tensor. + * @param[in] phase_ptr Pointer to the second input tensor.
+ * @param[out] output_ptr Pointer to the output tensor + * @param[in] stride_mag Stride of the magnitude tensor + * @param[in] upper_thr Upper threshold used for the hysteresis + * @param[in] lower_thr Lower threshold used for the hysteresis + */ + using EdgeNonMaxSupprFunction = void(const void *__restrict magnitude_ptr, const void *__restrict phase_ptr, void *__restrict output_ptr, const uint32_t stride_mag, const int32_t upper_thr, + const int32_t lower_thr); + + EdgeNonMaxSupprFunction *_func; /**< Non-Maxima suppression function to use for the particular tensor types passed to configure() */ + const ITensor *_magnitude; /**< Source tensor - Magnitude */ + const ITensor *_phase; /**< Source tensor - Quantized phase */ + ITensor *_output; /**< Destination tensor */ + int32_t _lower_thr; /**< Lower threshold used for the hysteresis */ + int32_t _upper_thr; /**< Upper threshold used for the hysteresis */ +}; + +/** NEON kernel to perform Edge tracing */ +class NEEdgeTraceKernel : public INEKernel +{ +public: + /** Default constructor */ + NEEdgeTraceKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEEdgeTraceKernel(const NEEdgeTraceKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEEdgeTraceKernel &operator=(const NEEdgeTraceKernel &) = delete; + /** Allow instances of this class to be moved */ + NEEdgeTraceKernel(NEEdgeTraceKernel &&) = default; + /** Allow instances of this class to be moved */ + NEEdgeTraceKernel &operator=(NEEdgeTraceKernel &&) = default; + /** Default destructor */ + ~NEEdgeTraceKernel() = default; + + /** Initialise the kernel's source and destination. + * + * @param[in,out] input Source tensor. Data type supported: U8. Must contain 0 for "no edge", 127 for "maybe", 255 for "edge" + * @param[in,out] output Destination tensor. Data type supported: U8. Must be initialized to 0 (No edge). + */ + void configure(ITensor *input, ITensor *output); + + // Inherited methods overridden: + void run(const Window &window) override; + BorderSize border_size() const override; + bool is_parallelisable() const override; + +private: + ITensor *_input; /**< Source tensor */ + ITensor *_output; /**< Destination tensor */ +}; +} +#endif /* __ARM_COMPUTE_NECANNYEDGEKERNEL_H__ */
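[Editor's note] Together with the Sobel/Scharr kernels declared elsewhere in this patch, the three classes above form the classic Canny pipeline. A hedged sketch of the stage order (in practice driven by the runtime's NECannyEdge function; border handling and intermediate allocation elided):

    // 1. Sobel/Scharr kernels          -> gx, gy gradients
    // 2. NEGradientKernel              -> magnitude + quantised phase
    // 3. NEEdgeNonMaxSuppressionKernel -> per-pixel 0 (no edge) / 127 (maybe) / 255 (edge)
    // 4. NEEdgeTraceKernel             -> hysteresis: resolves the 127 "maybe" pixels to 0 or 255

diff --git a/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h b/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h new file mode 100644 index 0000000000..8b669a4d28 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.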
diff --git a/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h b/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h
new file mode 100644
index 0000000000..8b669a4d28
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H__
+#define __ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <array>
+#include <cstdint>
+
+namespace arm_compute
+{
+class IMultiImage;
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the channel combine kernel */
+class NEChannelCombineKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEChannelCombineKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEChannelCombineKernel(const NEChannelCombineKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEChannelCombineKernel &operator=(const NEChannelCombineKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEChannelCombineKernel(NEChannelCombineKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEChannelCombineKernel &operator=(NEChannelCombineKernel &&) = default;
+    /** Default destructor */
+    ~NEChannelCombineKernel() = default;
+
+    /** Configure function's inputs and outputs.
+     *
+     * @param[in]  plane0 The 2D plane that forms channel 0. Data type supported: U8
+     * @param[in]  plane1 The 2D plane that forms channel 1. Data type supported: U8
+     * @param[in]  plane2 The 2D plane that forms channel 2. Data type supported: U8
+     * @param[in]  plane3 The 2D plane that forms channel 3. Data type supported: U8
+     * @param[out] output The single planar output tensor. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
+     */
+    void configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output);
+    /** Configure function's inputs and outputs.
+     *
+     * @param[in]  plane0 The 2D plane that forms channel 0. Data type supported: U8
+     * @param[in]  plane1 The 2D plane that forms channel 1. Data type supported: U8
+     * @param[in]  plane2 The 2D plane that forms channel 2. Data type supported: U8
+     * @param[out] output The multi planar output tensor. Formats supported: NV12/NV21/IYUV/YUV444
+     */
+    void configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    bool is_parallelisable() const override;
+
+private:
+    /** Combine 3 planes to form a three channel single plane tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void combine_3C(const Window &win);
+    /** Combine 4 planes to form a four channel single plane tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void combine_4C(const Window &win);
+    /** Combine 3 planes to form a single plane YUV tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    template <bool is_yuyv>
+    void combine_YUV_1p(const Window &win);
+    /** Combine 3 planes to form a two plane YUV tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void combine_YUV_2p(const Window &win);
+    /** Combine 3 planes to form a three plane YUV tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void combine_YUV_3p(const Window &win);
+    /** Copies a full plane to the output tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void copy_plane(const Window &win, uint32_t plane_id);
+    /** Common signature for all the specialised ChannelCombine functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using ChannelCombineFunction = void (NEChannelCombineKernel::*)(const Window &window);
+    /** ChannelCombine function to use for the particular tensor types passed to configure() */
+    ChannelCombineFunction _func;
+    std::array<const ITensor *, 4> _planes;
+    ITensor                       *_output;
+    IMultiImage                   *_output_multi;
+    std::array<uint32_t, 3>        _x_subsampling;
+    std::array<uint32_t, 3>        _y_subsampling;
+    unsigned int                   _num_elems_processed_per_iteration;
+    bool                           _is_parallelizable;
+};
+}
+#endif /* __ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H__ */
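A hedged sketch of the two configure() overloads above, assuming the planes and output images are created and allocated beforehand; passing nullptr for plane3 with a three-channel destination format is an assumption of this example:

    using namespace arm_compute;

    // Three U8 planes combined into one interleaved RGB888 tensor (plane3 unused).
    NEChannelCombineKernel combine_rgb;
    combine_rgb.configure(&r_plane, &g_plane, &b_plane, nullptr, &rgb_image);

    // Three U8 planes combined into a multi-planar NV12 image.
    NEChannelCombineKernel combine_nv12;
    combine_nv12.configure(&y_plane, &u_plane, &v_plane, &nv12_image);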
diff --git a/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h b/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h
new file mode 100644
index 0000000000..0715e1f8cb
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H__
+#define __ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class IMultiImage;
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the channel extract kernel */
+class NEChannelExtractKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NEChannelExtractKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEChannelExtractKernel(const NEChannelExtractKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEChannelExtractKernel &operator=(const NEChannelExtractKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEChannelExtractKernel(NEChannelExtractKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEChannelExtractKernel &operator=(NEChannelExtractKernel &&) = default;
+    /** Default destructor */
+    ~NEChannelExtractKernel() = default;
+
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input   Source tensor. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
+     * @param[in]  channel Channel to extract.
+     * @param[out] output  Destination tensor. Format supported: U8
+     */
+    void configure(const ITensor *input, Channel channel, ITensor *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input   Multi-planar source image. Formats supported: NV12/NV21/IYUV/YUV444
+     * @param[in]  channel Channel to extract.
+     * @param[out] output  Single-planar destination image. Format supported: U8
+     */
+    void configure(const IMultiImage *input, Channel channel, IImage *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Extract one channel from a two channel planar tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void extract_1C_from_2C_img(const Window &win);
+    /** Extract one channel from a three channel planar tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void extract_1C_from_3C_img(const Window &win);
+    /** Extract one channel from a four channel planar tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void extract_1C_from_4C_img(const Window &win);
+    /** Extract U/V channel from a single planar YUYV/UYVY tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void extract_YUYV_uv(const Window &win);
+    /** Copies a full plane to the output tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void copy_plane(const Window &win);
+    /** Common signature for all the specialised ChannelExtract functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using ChannelExtractFunction = void (NEChannelExtractKernel::*)(const Window &window);
+    /** ChannelExtract function to use for the particular tensor types passed to configure() */
+    ChannelExtractFunction _func;
+    unsigned int           _lut_index;
+};
+}
+#endif /* __ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NECol2ImKernel.h b/arm_compute/core/NEON/kernels/NECol2ImKernel.h
new file mode 100644
index 0000000000..f6bc2152da
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NECol2ImKernel.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECOL2IMKERNEL_H__
+#define __ARM_COMPUTE_NECOL2IMKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform col2im reshaping.
+ *
+ * Rearranges each matrix column into image blocks.
+ * It's the inverse operation of @ref NEIm2ColKernel.
+ *
+ * For example, a vector of 9 elements can be reshaped to a block (image) of 3x3:
+ *
+ * @f[
+ * \left( \begin{array}{ccccccccc}
+ * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccc}
+ * a0 & a1 & a2 \\
+ * a3 & a4 & a5 \\
+ * a6 & a7 & a8 \\
+ * \end{array} \right)
+ * @f]
+ */
+class NECol2ImKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NECol2ImKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NECol2ImKernel(const NECol2ImKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NECol2ImKernel &operator=(const NECol2ImKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NECol2ImKernel(NECol2ImKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NECol2ImKernel &operator=(NECol2ImKernel &&) = default;
+    /** Default destructor */
+    ~NECol2ImKernel() = default;
+
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  input          The input tensor to convert. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[out] output         The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
+     *                            while the rest represent batch of outputs. Data types supported: Same as @p input
+     * @param[in]  convolved_dims Output convolved dimensions.
+     */
+    void configure(const ITensor *input, ITensor *output, std::pair<unsigned int, unsigned int> convolved_dims);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Template function to run the col2im
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <typename T>
+    void run_col2im(const Window &window);
+
+    /** Common signature for all the specialised col2im functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using Col2ImFunctionPtr = void (NECol2ImKernel::*)(const Window &window);
+
+    Col2ImFunctionPtr _func;
+    const ITensor    *_input;
+    ITensor          *_output;
+    std::pair<unsigned int, unsigned int> _convolved_dims;
+};
+}
+
+#endif /*__ARM_COMPUTE_NECOL2IMKERNEL_H__ */
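The reshape documented above is a plain index remapping; a standalone scalar sketch, independent of the library and written for illustration only:

    #include <cstddef>

    // Standalone model of the col2im remapping: column c of a row-major
    // (width*height) x ofm matrix becomes one width x height plane of the
    // output image (matching the 9-element -> 3x3 example for ofm == 1).
    void col2im(const float *matrix, float *image, size_t width, size_t height, size_t ofm)
    {
        for(size_t c = 0; c < ofm; ++c)
        {
            for(size_t i = 0; i < width * height; ++i)
            {
                image[c * width * height + i] = matrix[i * ofm + c];
            }
        }
    }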
diff --git a/arm_compute/core/NEON/kernels/NEColorConvertKernel.h b/arm_compute/core/NEON/kernels/NEColorConvertKernel.h
new file mode 100644
index 0000000000..2297218117
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEColorConvertKernel.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECOLORCONVERTKERNEL_H__
+#define __ARM_COMPUTE_NECOLORCONVERTKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class IMultiImage;
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the color convert kernel */
+class NEColorConvertKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEColorConvertKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEColorConvertKernel(const NEColorConvertKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEColorConvertKernel &operator=(const NEColorConvertKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEColorConvertKernel(NEColorConvertKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEColorConvertKernel &operator=(NEColorConvertKernel &&) = default;
+    /** Default destructor */
+    ~NEColorConvertKernel() = default;
+
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input  Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
+     * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
+     *                    RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888)
+     */
+    void configure(const ITensor *input, ITensor *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input  Multi-planar source image. Formats supported: NV12/NV21/IYUV
+     * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888
+     */
+    void configure(const IMultiImage *input, IImage *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input  Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
+     * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGBA8888)
+     */
+    void configure(const IImage *input, IMultiImage *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input  Multi-planar source image. Formats supported: NV12/NV21/IYUV
+     * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of @p input is IYUV)
+     */
+    void configure(const IMultiImage *input, IMultiImage *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    using ColorConvertFunction = void(const void *__restrict input_ptr, void *__restrict output_ptr, const Window &win);
+    const void *_input;
+    void       *_output;
+    ColorConvertFunction *_func;
+};
+}
+#endif /*__ARM_COMPUTE_NECOLORCONVERTKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEConvolutionKernel.h b/arm_compute/core/NEON/kernels/NEConvolutionKernel.h
new file mode 100644
index 0000000000..588a228a5d
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEConvolutionKernel.h
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECONVOLUTIONKERNEL_H__
+#define __ARM_COMPUTE_NECONVOLUTIONKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+#include <array>
+#include <cstdint>
+#include <vector>
+
+namespace arm_compute
+{
+class ITensor;
+
+/****************************************************************************************\
+ *                                    Square Convolution                                *
+\****************************************************************************************/
+
+/** Interface for the kernel to run an arbitrary size convolution on a tensor. (Currently supports 3x3, 5x5, 7x7 and 9x9).
+ * The client can supply a convolution matrix \f$ C_{m,n} \f$.
+ * @f{eqnarray}{
+ *  k_0 &=& \frac{m}{2} \\
+ *  l_0 &=& \frac{n}{2} \\
+ *  sum &=& \sum_{k=0,l=0}^{k=m-1,l=n-1} input(x+k-k_0, y+l-l_0) C_{k,l}
+ * @f}
+ *
+ * @note The above equation for this function is similar to the default OpenCV Filter2D function,
+ *       which actually computes a correlation and not a convolution.
+ *       In case of a real convolution the convolution matrix should be flipped both horizontally and vertically.
+ */
+template <unsigned int matrix_size>
+class NEConvolutionKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NEConvolutionKernel();
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output           Destination tensor. Data types supported: U8, S16.
+     * @param[in]  conv             Convolution matrix to apply to the input tensor.
+     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    template <typename OutputType>
+    void convolution(const Window &win);
+
+protected:
+    uint32_t _scale;                                             /**< scale of the convolution */
+    std::array<int16_t, matrix_size *matrix_size> _convolution; /**< convolution matrix */
+};
+
+/** Interface for the kernel which applies a 3x3 convolution to a tensor.*/
+using NEConvolution3x3Kernel = NEConvolutionKernel<3>;
+/** Interface for the kernel which applies a 5x5 convolution to a tensor.*/
+using NEConvolution5x5Kernel = NEConvolutionKernel<5>;
+/** Interface for the kernel which applies a 7x7 convolution to a tensor.*/
+using NEConvolution7x7Kernel = NEConvolutionKernel<7>;
+/** Interface for the kernel which applies a 9x9 convolution to a tensor.*/
+using NEConvolution9x9Kernel = NEConvolutionKernel<9>;
+
+/****************************************************************************************\
+ *                              Separable Square Convolution                            *
+\****************************************************************************************/
+
+/** Kernel for the Horizontal pass of a Separable Convolution */
+template <unsigned int matrix_size>
+class NESeparableConvolutionHorKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NESeparableConvolutionHorKernel();
+
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output           Destination tensor. Data types supported: U16, S16, S32.
+     * @param[in]  conv_row         Convolution matrix to apply to the input tensor.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, const int16_t *conv_row, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Apply the object's convolution to the given window of the input tensor.
+     *
+     * @param[in] window Window to apply the convolution on.
+     */
+    template <typename OutputType>
+    void convolve(const Window &window);
+
+    std::array<int16_t, matrix_size> _conv_row; /**< Convolution coefficients */
+    BorderSize _border_size;                    /**< Border size */
+};
+
+/** Interface for the kernel which applies a 5x1 horizontal convolution to a tensor.*/
+using NESeparableConvolution5x5HorKernel = NESeparableConvolutionHorKernel<5>;
+/** Interface for the kernel which applies a 7x1 horizontal convolution to a tensor.*/
+using NESeparableConvolution7x7HorKernel = NESeparableConvolutionHorKernel<7>;
+/** Interface for the kernel which applies a 9x1 horizontal convolution to a tensor.*/
+using NESeparableConvolution9x9HorKernel = NESeparableConvolutionHorKernel<9>;
+
+/** Kernel for the Vertical pass of a Separable Convolution */
+template <unsigned int matrix_size>
+class NESeparableConvolutionVertKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NESeparableConvolutionVertKernel();
+
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U16, S16, S32.
+     * @param[out] output           Destination tensor. Data types supported: U8, S16.
+     * @param[in]  conv_col         Convolution matrix to apply to the input tensor.
+     * @param[in]  scale            Scale of the convolution matrix
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, const int16_t *conv_col, uint32_t scale, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Apply the object's convolution to the given window of the input tensor.
+     *  This function is used if the intermediate values have been stored as U16.
+     *
+     * @param[in] win Window to apply the convolution on.
+     */
+    template <typename OutputType>
+    void convolution_u16(const Window &win);
+    /** Apply the object's convolution to the given window of the input tensor.
+     *  This function is used if the intermediate values have been stored as S16.
+     *
+     * @param[in] win Window to apply the convolution on.
+     */
+    template <typename OutputType>
+    void convolution_s16(const Window &win);
+    /** Apply the object's convolution to the given window of the input tensor.
+     *  This function is used if the intermediate values have been stored as S32.
+     *
+     * @param[in] win Window to apply the convolution on.
+     */
+    template <typename OutputType>
+    void convolution_s32(const Window &win);
+
+    std::array<int16_t, matrix_size> _conv_col; /**< Convolution coefficients */
+    uint32_t _scale;                            /**< Convolution's scale */
+};
+
+/** Interface for the kernel which applies a 1x5 vertical convolution to a tensor.*/
+using NESeparableConvolution5x5VertKernel = NESeparableConvolutionVertKernel<5>;
+/** Interface for the kernel which applies a 1x7 vertical convolution to a tensor.*/
+using NESeparableConvolution7x7VertKernel = NESeparableConvolutionVertKernel<7>;
+/** Interface for the kernel which applies a 1x9 vertical convolution to a tensor.*/
+using NESeparableConvolution9x9VertKernel = NESeparableConvolutionVertKernel<9>;
+
+/****************************************************************************************\
+ *                                 Rectangle Convolution                                *
+\****************************************************************************************/
+
+/** Kernel for running a convolution on a rectangle matrix.
+ *
+ * @note Supports combinations of 3, 5, 7 and 9.
+ */
+class NEConvolutionRectangleKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEConvolutionRectangleKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEConvolutionRectangleKernel(NEConvolutionRectangleKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEConvolutionRectangleKernel &operator=(NEConvolutionRectangleKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEConvolutionRectangleKernel(NEConvolutionRectangleKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEConvolutionRectangleKernel &operator=(NEConvolutionRectangleKernel &&) = default;
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output           Destination tensor. Data types supported: U8, S16.
+     * @param[in]  conv             Convolution matrix to apply to the input tensor.
+     * @param[in]  width            Width of convolution matrix (Number of columns)
+     * @param[in]  height           Height of convolution matrix (Number of rows)
+     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    unsigned int get_index(uint32_t val);
+    /** Apply the object's convolution to the given window of the input tensor.
+     *
+     * @param[in] win Window to apply the convolution on.
+     */
+    template <typename OutputType, unsigned int rows, unsigned int cols>
+    void convolution(const Window &win);
+
+protected:
+    const ITensor *_input;             /**< Input tensor */
+    ITensor       *_output;            /**< Output tensor */
+    uint32_t       _scale;             /**< Scale of the convolution */
+    std::vector<int16_t> _convolution; /**< Convolution matrix */
+    BorderSize _border_size;           /**< Calculated border width */
+    uint32_t   _func_idx;              /**< Index used to specify convolution function to be used */
+    const static unsigned int _nr_supported_sizes
+    {
+        4
+    }; /**< Number of supported permutations */
+};
+}
+#endif /*__ARM_COMPUTE_NECONVOLUTIONKERNEL_H__ */
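A standalone scalar model of the square convolution formula and the scale convention documented above (scale == 0 means "sum of the coefficients, or 1 if they cancel"); border handling is elided and, as the doxygen notes, the accumulation is correlation-style:

    #include <cstdint>
    #include <numeric>
    #include <vector>

    int16_t convolve_pixel(const uint8_t *src, int stride, int x, int y,
                           const std::vector<int16_t> &conv, int size, uint32_t scale)
    {
        int32_t s = static_cast<int32_t>(scale);
        if(s == 0) // convention from the doxygen above
        {
            s = std::accumulate(conv.begin(), conv.end(), 0);
            if(s == 0)
            {
                s = 1;
            }
        }
        int32_t   sum  = 0;
        const int half = size / 2;
        for(int k = 0; k < size; ++k)
        {
            for(int l = 0; l < size; ++l)
            {
                sum += src[(y + k - half) * stride + (x + l - half)] * conv[k * size + l];
            }
        }
        return static_cast<int16_t>(sum / s);
    }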
diff --git a/arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h b/arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h
new file mode 100644
index 0000000000..67b8c6052d
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECUMULATIVEDISTRIBUTIONKERNEL_H__
+#define __ARM_COMPUTE_NECUMULATIVEDISTRIBUTIONKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class IDistribution1D;
+class ILut;
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the cumulative distribution (cumulative summation) calculation kernel.
+ *
+ * This kernel calculates the cumulative sum of a given distribution (meaning that each output element
+ * is the sum of all its previous elements including itself) and creates a lookup table with the normalized
+ * pixel intensities which is used to improve the contrast of the image.
+ */
+class NECumulativeDistributionKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NECumulativeDistributionKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NECumulativeDistributionKernel(const NECumulativeDistributionKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NECumulativeDistributionKernel &operator=(const NECumulativeDistributionKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NECumulativeDistributionKernel(NECumulativeDistributionKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NECumulativeDistributionKernel &operator=(NECumulativeDistributionKernel &&) = default;
+    /** Set the input and output distribution.
+     *
+     * @param[in]  input          Input image. Data type supported: U8
+     * @param[in]  distribution   Unnormalized 256-bin distribution of the input image.
+     * @param[out] cumulative_sum Cumulative distribution (summed histogram). Should be same size as @p distribution.
+     * @param[out] output         Equalization lookup table. Should consist of 256 entries of U8 elements.
+     */
+    void configure(const IImage *input, const IDistribution1D *distribution, IDistribution1D *cumulative_sum, ILut *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    bool is_parallelisable() const override;
+
+private:
+    const IImage          *_input;          /**< Input image. */
+    const IDistribution1D *_distribution;   /**< Input histogram of the input image. */
+    IDistribution1D       *_cumulative_sum; /**< The cumulative distribution. */
+    ILut                  *_output;         /**< Output with the equalization lookup table. */
+private:
+    static const uint32_t _histogram_size = 256; /**< Default histogram size of 256. */
+};
+}
+
+#endif /*__ARM_COMPUTE_NECUMULATIVEDISTRIBUTIONKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h b/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h
new file mode 100644
index 0000000000..7384cd1f02
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H__
+#define __ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the depth concatenate kernel.
+ *  The input tensor will be concatenated into the output tensor.
+ */
+class NEDepthConcatenateKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEDepthConcatenateKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthConcatenateKernel(const NEDepthConcatenateKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthConcatenateKernel &operator=(const NEDepthConcatenateKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEDepthConcatenateKernel(NEDepthConcatenateKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEDepthConcatenateKernel &operator=(NEDepthConcatenateKernel &&) = default;
+    /** Default destructor */
+    ~NEDepthConcatenateKernel() = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]     input        Input tensor. Data types supported: F32.
+     * @param[in]     depth_offset The offset on the Z axis.
+     * @param[in,out] output       Output tensor. Data types supported: F32.
+     *
+     * @note The output tensor's two lowest dimensions can't be smaller than the input's.
+     * @note The gaps between the two lowest dimensions of input and output need to be divisible by 2.
+     *
+     */
+    void configure(const ITensor *input, unsigned int depth_offset, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    const ITensor *_input;
+    ITensor       *_output;
+    int            _top_bottom;
+    int            _left_right;
+    unsigned int   _depth_offset;
+};
+}
+#endif /* __ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H__ */
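A hedged sketch of stacking two F32 tensors along Z with the kernel above; one kernel instance is configured per input, and the shapes are assumed to satisfy the notes in the doxygen:

    using namespace arm_compute;

    // input0 occupies depths [0, d0), input1 occupies [d0, d0 + d1).
    NEDepthConcatenateKernel concat0;
    NEDepthConcatenateKernel concat1;
    concat0.configure(&input0, 0 /* depth_offset */, &output);
    concat1.configure(&input1, input0.info()->dimension(2) /* depth_offset */, &output);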
diff --git a/arm_compute/core/NEON/kernels/NEDepthConvertKernel.h b/arm_compute/core/NEON/kernels/NEDepthConvertKernel.h
new file mode 100644
index 0000000000..0c5c29e4db
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDepthConvertKernel.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDEPTHCONVERTKERNEL_H__
+#define __ARM_COMPUTE_NEDEPTHCONVERTKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Depth conversion kernel */
+class NEDepthConvertKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NEDepthConvertKernel();
+    /** Set the input and output of the kernel
+     *
+     * Valid conversions Input -> Output:
+     *
+     *   - QS8 -> F32
+     *   - U8  -> U16, S16, S32
+     *   - U16 -> U8, U32
+     *   - S16 -> U8, S32
+     *   - F32 -> QS8
+     *
+     * @param[in]  input  The input tensor to convert. Data types supported: U8/QS8/U16/S16/F32.
+     * @param[out] output The output tensor. Data types supported: U8/QS8/U16/S16/U32/S32/F32.
+     * @param[in]  policy Conversion policy.
+     * @param[in]  shift  Value for down/up conversions. Must be 0 <= shift < 8.
+     */
+    void configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    ConvertPolicy _policy;
+    uint32_t      _shift;
+};
+}
+#endif /*__ARM_COMPUTE_NEDEPTHCONVERTKERNEL_H__ */
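A minimal configure() sketch for one of the valid conversions listed above (U8 -> S16), assuming both tensors are initialised and allocated elsewhere; the policy only takes effect on narrowing conversions:

    using namespace arm_compute;

    NEDepthConvertKernel convert;
    convert.configure(&src_u8, &dst_s16, ConvertPolicy::SATURATE, 1 /* shift: up-convert with << 1 */);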
diff --git a/arm_compute/core/NEON/kernels/NEDerivativeKernel.h b/arm_compute/core/NEON/kernels/NEDerivativeKernel.h
new file mode 100644
index 0000000000..abb8a894c0
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDerivativeKernel.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDERIVATIVEKERNEL_H__
+#define __ARM_COMPUTE_NEDERIVATIVEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to run the derivative along the X/Y directions on a tensor.
+ *
+ */
+class NEDerivativeKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEDerivativeKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDerivativeKernel(const NEDerivativeKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDerivativeKernel &operator=(const NEDerivativeKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEDerivativeKernel(NEDerivativeKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEDerivativeKernel &operator=(NEDerivativeKernel &&) = default;
+    /** Initialise the kernel's sources, destination and border
+     *
+     * @note At least one of output_x or output_y must be set
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Function to perform derivative along the X direction on the given window
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    void derivative_x(const Window &window);
+    /** Function to perform derivative along the Y direction on the given window
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    void derivative_y(const Window &window);
+    /** Function to perform derivative along the X and Y direction on the given window
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    void derivative_xy(const Window &window);
+    /** Common signature for all the specialised derivative functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using DerivativeFunction = void (NEDerivativeKernel::*)(const Window &window);
+    /** Derivative function to use for the particular tensor types passed to configure() */
+    DerivativeFunction _func;
+
+private:
+    const ITensor *_input;    /**< Input tensor */
+    ITensor       *_output_x; /**< Output tensor - Derivative along the X direction */
+    ITensor       *_output_y; /**< Output tensor - Derivative along the Y direction */
+};
+}
+#endif /* __ARM_COMPUTE_NEDERIVATIVEKERNEL_H__ */
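A minimal configure() sketch for the derivative kernel above, assuming the tensors are allocated elsewhere; passing nullptr for one of the outputs is also valid since only one of them is required:

    using namespace arm_compute;

    NEDerivativeKernel derivative;
    derivative.configure(&src_u8, &grad_x_s16, &grad_y_s16, false /* border_undefined */);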
diff --git a/arm_compute/core/NEON/kernels/NEDilateKernel.h b/arm_compute/core/NEON/kernels/NEDilateKernel.h
new file mode 100644
index 0000000000..05f148a1fd
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDilateKernel.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDILATEKERNEL_H__
+#define __ARM_COMPUTE_NEDILATEKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform boolean image dilation */
+class NEDilateKernel : public INESimpleKernel
+{
+public:
+    /** Set the source, destination and border mode of the kernel
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8
+     * @param[out] output           Destination tensor. Data type supported: U8
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+};
+}
+#endif /*__ARM_COMPUTE_NEDILATEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
new file mode 100644
index 0000000000..f098e18655
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERBIASACCUMULATEKERNEL_H__
+#define __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERBIASACCUMULATEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+/** NEON kernel to accumulate the biases to each element of the input tensor
+ *
+ * @note We assume bias to be shared
+ */
+class NEDirectConvolutionLayerBiasAccumulateKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEDirectConvolutionLayerBiasAccumulateKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDirectConvolutionLayerBiasAccumulateKernel(const NEDirectConvolutionLayerBiasAccumulateKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDirectConvolutionLayerBiasAccumulateKernel &operator=(const NEDirectConvolutionLayerBiasAccumulateKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEDirectConvolutionLayerBiasAccumulateKernel(NEDirectConvolutionLayerBiasAccumulateKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEDirectConvolutionLayerBiasAccumulateKernel &operator=(NEDirectConvolutionLayerBiasAccumulateKernel &&) = default;
+    /** Default destructor */
+    ~NEDirectConvolutionLayerBiasAccumulateKernel() = default;
+    /** Set the accumulate buffer and the biases of the kernel.
+     *
+     * @param[in, out] input  Input to add the bias to. If @p output is not specified then accumulation is done in-place.
+     *                        Data type supported: QS8/F32
+     * @param[in]      bias   The shared bias tensor to add. It must be a 1D tensor. Data type supported: Same as @p input
+     * @param[out]     output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
+     *                        Data type supported: Same as @p input
+     */
+    void configure(ITensor *input, const ITensor *bias, ITensor *output = nullptr);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    using BiasAccumulateKernel = void(ITensor *input, const ITensor *bias, const Window window, ITensor *output);
+
+private:
+    BiasAccumulateKernel *_func;
+    ITensor              *_input;
+    const ITensor        *_bias;
+    ITensor              *_output;
+};
+}
+#endif /*__ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERBIASACCUMULATEKERNEL_H__ */
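A hedged sketch of the in-place form of the configure() call documented above, assuming the convolution result and the 1D bias tensor already exist:

    using namespace arm_compute;

    NEDirectConvolutionLayerBiasAccumulateKernel bias_accumulate;
    bias_accumulate.configure(&conv_out_f32, &bias_f32); // output omitted -> accumulate in-place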
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
new file mode 100644
index 0000000000..d726071606
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON interface for Direct Convolution Layer kernel */
+class NEDirectConvolutionLayerKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEDirectConvolutionLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDirectConvolutionLayerKernel(const NEDirectConvolutionLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDirectConvolutionLayerKernel &operator=(const NEDirectConvolutionLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEDirectConvolutionLayerKernel(NEDirectConvolutionLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEDirectConvolutionLayerKernel &operator=(NEDirectConvolutionLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEDirectConvolutionLayerKernel() = default;
+    /** Set the input, weights and output tensors.
+     *
+     * @param[in]  input     Input tensor. Data types supported: QS8/F32.
+     * @param[in]  weights   Set of kernels to convolve the input volume.
+     *                       The 3rd dimension must be the same as the input's volume 3rd dimension.
+     *                       Data type supported: Same as @p input.
+     * @param[out] output    Output tensor.
+     *                       The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p input.
+     * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
+     */
+    void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    const ITensor *_input;
+    const ITensor *_weights;
+    ITensor       *_output;
+    PadStrideInfo  _conv_info;
+    BorderSize     _border_size;
+    unsigned int   _kernel_size;
+    unsigned int   _num_elems_read_per_iteration;
+    unsigned int   _num_elems_written_per_iteration;
+};
+}
+#endif /*__ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEErodeKernel.h b/arm_compute/core/NEON/kernels/NEErodeKernel.h
new file mode 100644
index 0000000000..86dc217cc0
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEErodeKernel.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEERODEKERNEL_H__
+#define __ARM_COMPUTE_NEERODEKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform boolean image erosion */
+class NEErodeKernel : public INESimpleKernel
+{
+public:
+    /** Set the source, destination and border mode of the kernel
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8
+     * @param[out] output           Destination tensor. Data type supported: U8
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+};
+}
+#endif /*__ARM_COMPUTE_NEERODEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEFastCornersKernel.h b/arm_compute/core/NEON/kernels/NEFastCornersKernel.h
new file mode 100644
index 0000000000..d9bd6acde9
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEFastCornersKernel.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFASTCORNERSKERNEL_H__
+#define __ARM_COMPUTE_NEFASTCORNERSKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** NEON kernel to perform fast corners */
+class NEFastCornersKernel : public INEKernel
+{
+public:
+    /** Constructor */
+    NEFastCornersKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFastCornersKernel(const NEFastCornersKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFastCornersKernel &operator=(const NEFastCornersKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEFastCornersKernel(NEFastCornersKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEFastCornersKernel &operator=(NEFastCornersKernel &&) = default;
+    /** Initialise the kernel.
+     *
+     * @param[in]  input               Source image. Data type supported: U8.
+     * @param[out] output              Output image. Data type supported: U8.
+     * @param[in]  threshold           Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
+     * @param[in]  non_max_suppression True if non-maxima suppression is applied, false otherwise.
+     * @param[in]  border_undefined    True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const IImage *input, IImage *output, uint8_t threshold, bool non_max_suppression, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    const IImage *_input;              /**< source image */
+    IImage       *_output;             /**< intermediate results */
+    uint8_t       _threshold;          /**< threshold on difference between intensity */
+    bool          _non_max_suppression; /**< True if non-maxima suppression is applied in the next stage */
+};
+}
+#endif
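A minimal configure() sketch for the FAST corners kernel above, assuming the images are allocated elsewhere; the threshold value is an illustrative assumption:

    using namespace arm_compute;

    NEFastCornersKernel fast_corners;
    fast_corners.configure(&src_u8, &corner_response_u8, 20 /* threshold */,
                           true /* non_max_suppression */, true /* border_undefined */);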
diff --git a/arm_compute/core/NEON/kernels/NEFillArrayKernel.h b/arm_compute/core/NEON/kernels/NEFillArrayKernel.h
new file mode 100644
index 0000000000..8e0846ea88
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEFillArrayKernel.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFILLARRAYKERNEL_H__
+#define __ARM_COMPUTE_NEFILLARRAYKERNEL_H__
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** This kernel adds all texels greater than or equal to the threshold value to the keypoint array. */
+class NEFillArrayKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEFillArrayKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFillArrayKernel(const NEFillArrayKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFillArrayKernel &operator=(const NEFillArrayKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEFillArrayKernel(NEFillArrayKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEFillArrayKernel &operator=(NEFillArrayKernel &&) = default;
+    /** Default destructor */
+    ~NEFillArrayKernel() = default;
+
+    /** Initialise the kernel.
+     *
+     * @param[in]  input     Source image. Data type supported: U8.
+     * @param[in]  threshold Texels greater than or equal to the threshold will be added to the array.
+     * @param[out] output    Array of keypoints used to store the results.
+     */
+    void configure(const IImage *input, uint8_t threshold, IKeyPointArray *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    bool is_parallelisable() const override;
+
+private:
+    const IImage   *_input;
+    IKeyPointArray *_output;
+    uint8_t         _threshold;
+};
+}
+#endif
diff --git a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
new file mode 100644
index 0000000000..3ec66115e2
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFILLBORDERKERNEL_H__
+#define __ARM_COMPUTE_NEFILLBORDERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to fill borders */
+class NEFillBorderKernel : public INEKernel
+{
+public:
+    /** Default Constructor */
+    NEFillBorderKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFillBorderKernel(const NEFillBorderKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFillBorderKernel &operator=(const NEFillBorderKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEFillBorderKernel(NEFillBorderKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEFillBorderKernel &operator=(NEFillBorderKernel &&) = default;
+    /** Default destructor */
+    ~NEFillBorderKernel() = default;
+
+    /** Initialise the function.
+     *
+     * @note This kernel fills the borders within the XY-planes.
+     *
+     * @param[in,out] tensor                Tensor to process. Data types supported: U8/S8/QS8/QS16/S16/S32/F32.
+     * @param[in]     border_size           Size of the border to fill in elements.
+     * @param[in]     border_mode           Border mode to use when filling the border.
+     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     *
+     */
+    void configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    template <typename T>
+    void fill_replicate_single_channel(const Window &window);
+    template <typename T>
+    void fill_constant_value_single_channel(const Window &window);
+
+    ITensor   *_tensor;
+    BorderSize _border_size;
+    BorderMode _mode;
+    PixelValue _constant_border_value;
+};
+}
+#endif /*__ARM_COMPUTE_NEFILLBORDERKERNEL_H__ */
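A minimal usage sketch for the kernel above, assuming the tensor is already configured and allocated; in practice a scheduler such as NEScheduler would dispatch the kernel instead of calling run() directly, and the helper name is illustrative:

    #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"

    using namespace arm_compute;

    // Fill a two-element border around an already-allocated tensor with zeros.
    void fill_zero_border(ITensor *tensor)
    {
        NEFillBorderKernel fill_border;
        fill_border.configure(tensor, BorderSize(2), BorderMode::CONSTANT, PixelValue());
        // Run directly over the kernel's own maximum window.
        fill_border.run(fill_border.window());
    }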
diff --git a/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h
new file mode 100644
index 0000000000..61e6e46463
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFILLINNERBORDERKERNEL_H__
+#define __ARM_COMPUTE_NEFILLINNERBORDERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to fill the interior borders */
+class NEFillInnerBorderKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEFillInnerBorderKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFillInnerBorderKernel(const NEFillInnerBorderKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFillInnerBorderKernel &operator=(const NEFillInnerBorderKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEFillInnerBorderKernel(NEFillInnerBorderKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEFillInnerBorderKernel &operator=(NEFillInnerBorderKernel &&) = default;
+    /** Default destructor */
+    ~NEFillInnerBorderKernel() = default;
+
+    /** Initialise the function.
+     *
+     * @note This kernel fills the borders within the XY-planes.
+     *
+     * @param[in,out] input                 Tensor to process. Data types supported: U8/QS8/S16/S32/F32.
+     * @param[in]     border_size           Size of the border to fill in elements.
+     * @param[in]     constant_border_value (Optional) Constant value to use for the borders.
+     *
+     */
+    void configure(ITensor *input, BorderSize border_size, const PixelValue &constant_border_value = PixelValue());
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    template <typename T>
+    void fill_value_single_channel(const Window &window);
+
+    ITensor   *_tensor;
+    BorderSize _border_size;
+    PixelValue _constant_border_value;
+};
+}
+#endif /*__ARM_COMPUTE_NEFILLINNERBORDERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
new file mode 100644
index 0000000000..b9884ffb57
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H__
+#define __ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to interleave the elements of a matrix
+ *
+ * This function puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
+ *
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccccccccccc}
+ * a00 & a10 & a20 & a30 & a01 & a11 & a21 & a31 & a02 & a12 & a22 & a32 & a03 & a13 & a23 & a33 \\
+ * \end{array} \right)
+ * @f]
+ *
+ * After this operation, the output matrix will have the following shape: [ width * 4, ceil(height / 4.0f) ]
+ */
+class NEGEMMInterleave4x4Kernel : public INESimpleKernel
+{
+public:
+    /** Constructor */
+    NEGEMMInterleave4x4Kernel();
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input  Input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
+     * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input.
+     */
+    void configure(const ITensor *input, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Common signature for all the interleave functions
+     *
+     * @param[in]  input  An input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
+     * @param[out] output The output tensor. Data type supported: same as @p input
+     * @param[in]  window Region on which to execute the kernel.
+     */
+    using GEMMInterleaveFunction = void(const ITensor *input, ITensor *output, const Window &window);
+
+    GEMMInterleaveFunction *_func; /**< GEMM interleave function to use for the particular tensor types passed to configure() */
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H__*/
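The interleaving described by the formula above can be expressed in scalar form. An illustrative reference, assuming the height is a multiple of 4 (the kernel itself handles the leftover rows):

    #include <cstddef>
    #include <vector>

    // Scalar reference of the 4x4 interleave: element (r, c) of the source goes
    // to row r / 4, column c * 4 + r % 4 of the output, i.e. four source rows
    // are zipped together element by element.
    std::vector<float> interleave4x4(const std::vector<float> &in, size_t width, size_t height)
    {
        std::vector<float> out(in.size());
        for(size_t r = 0; r < height; ++r)
        {
            for(size_t c = 0; c < width; ++c)
            {
                const size_t out_row = r / 4;
                const size_t out_col = c * 4 + r % 4;
                out[out_row * width * 4 + out_col] = in[r * width + c];
            }
        }
        return out;
    }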
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
new file mode 100644
index 0000000000..ba4dcc3373
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H__
+#define __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to multiply matrices
+ *
+ * @note @ref NEGEMMLowpMatrixMultiplyKernel is a low precision matrix product kernel.
+ * This kernel performs the following computation:
+ *
+ *  -# Convert a values from uint8 to int32 and add a_offset to each of them.
+ *  -# Convert b values from uint8 to int32 and add b_offset to each of them.
+ *  -# Compute the int32 matrix product of the resulting a * b.
+ *  -# Add output_offset to each entry of the result.
+ *  -# Multiply each entry of the result by output_mult_int, shift it right by shift bits and round to the nearest integer.
+ *  -# Clamp the resulting int32 values to the [0..255] range and cast to uint8.
+ *
+ */
+class NEGEMMLowpMatrixMultiplyKernel : public INEKernel
+{
+public:
+    /** Constructor */
+    NEGEMMLowpMatrixMultiplyKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    NEGEMMLowpMatrixMultiplyKernel(const NEGEMMLowpMatrixMultiplyKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers)*/
+    NEGEMMLowpMatrixMultiplyKernel &operator=(const NEGEMMLowpMatrixMultiplyKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGEMMLowpMatrixMultiplyKernel(NEGEMMLowpMatrixMultiplyKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGEMMLowpMatrixMultiplyKernel &operator=(NEGEMMLowpMatrixMultiplyKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * The input matrices @p input0 and @p input1 must be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel. These two
+     * kernels change the layout of the original matrices to be more cache-friendly.
+     *
+     * @param[in]  input0          Input tensor containing the interleaved Matrix A. Data type supported: U8
+     * @param[in]  input1          Input tensor containing the transposed Matrix B. Data type supported: same as @p input0
+     * @param[out] output          Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
+     * @param[in]  a_offset        Offset to be added to each element of the matrix A.
+     * @param[in]  b_offset        Offset to be added to each element of the matrix B.
+     * @param[in]  output_offset   Offset to be added to each element of the output matrix.
+     * @param[in]  output_mult_int Value to be multiplied with each entry of the result.
+     * @param[in]  shift           Number of bits to shift the result right.
+     */
+    void configure(const ITensor *input0, const ITensor *input1, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift);
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    const ITensor *_input0;
+    const ITensor *_input1;
+    ITensor       *_output;
+    int32_t        _a_offset;
+    int32_t        _b_offset;
+    int32_t        _output_offset;
+    int32_t        _output_mult_int;
+    int32_t        _shift;
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H__*/
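The computation steps listed above can be modelled per output element in scalar code. A sketch under stated assumptions: the rounding of the actual kernel is approximated here by a plain arithmetic shift, and the function name is illustrative:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    // Scalar model of the low-precision pipeline for a single output element:
    // a is one row of matrix A, b one column of matrix B, both of length k.
    uint8_t gemmlowp_ref(const uint8_t *a, const uint8_t *b, size_t k,
                         int32_t a_offset, int32_t b_offset,
                         int32_t output_offset, int32_t output_mult_int, int32_t shift)
    {
        int32_t acc = 0;
        for(size_t i = 0; i < k; ++i)
        {
            // Widen to int32 and add the per-operand offsets before multiplying.
            acc += (static_cast<int32_t>(a[i]) + a_offset) * (static_cast<int32_t>(b[i]) + b_offset);
        }
        acc = (acc + output_offset) * output_mult_int;
        acc = acc >> shift; // rounding omitted for brevity
        // Clamp to [0..255] and cast back to uint8.
        return static_cast<uint8_t>(std::max<int32_t>(0, std::min<int32_t>(255, acc)));
    }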
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
new file mode 100644
index 0000000000..c0ecafcd39
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H__
+#define __ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+/** NEON kernel to add a bias to each row of the input tensor */
+class NEGEMMMatrixAccumulateBiasesKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEGEMMMatrixAccumulateBiasesKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMMatrixAccumulateBiasesKernel(const NEGEMMMatrixAccumulateBiasesKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMMatrixAccumulateBiasesKernel &operator=(const NEGEMMMatrixAccumulateBiasesKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGEMMMatrixAccumulateBiasesKernel(NEGEMMMatrixAccumulateBiasesKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGEMMMatrixAccumulateBiasesKernel &operator=(NEGEMMMatrixAccumulateBiasesKernel &&) = default;
+    /** Default destructor */
+    ~NEGEMMMatrixAccumulateBiasesKernel() = default;
+    /** Set the accumulate buffer and the biases of the kernel.
+     *
+     * @param[in, out] accum  The accumulate tensor to add the biases to. Data type supported: QS8/F32
+     * @param[in]      biases The shared biases tensor to append. It must be a 1D tensor. Data type supported: Same as @p accum
+     */
+    void configure(ITensor *accum, const ITensor *biases);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    ITensor       *_accum;
+    const ITensor *_biases;
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
new file mode 100644
index 0000000000..1ab52fa2f2
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H__ +#define __ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H__ + +#include "arm_compute/core/NEON/INESimpleKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to perform the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta: + * + * @note [ MTX_OUT = MTX_0 + beta * MTX_1 ] with MTX_0 and MTX_1 of the same size + * + * @note This stage is used to finalize the GEMM result and it is computed if and only if beta != 0.0. In case this kernel is used for finalizing GEMM result, we have: + * - MTX_0 = A * B * alpha, where MTX_0 is the output of @ref NEGEMMMatrixMultiplyKernel + * - MTX_1 = C + */ +class NEGEMMMatrixAdditionKernel : public INESimpleKernel +{ +public: + /** Constructor */ + NEGEMMMatrixAdditionKernel(); + /** Prevent instances of this class from being copied */ + NEGEMMMatrixAdditionKernel(const NEGEMMMatrixAdditionKernel &) = delete; + /** Prevent instances of this class from being copied */ + NEGEMMMatrixAdditionKernel &operator=(const NEGEMMMatrixAdditionKernel &) = delete; + /** Allow instances of this class to be moved */ + NEGEMMMatrixAdditionKernel(NEGEMMMatrixAdditionKernel &&) = default; + /** Allow instances of this class to be moved */ + NEGEMMMatrixAdditionKernel &operator=(NEGEMMMatrixAdditionKernel &&) = default; + /** Initialise the kernel's input and output. + * + * @note The input and output tensor must have the same dimensions + * + * @param[in] input Input tensor (Matrix C). Data types supported: QS8/F16/F32 + * @param[in, out] output Output tensor. If this kernel is used to finalize the GEMM result, output contains the result obtained by the kernel @ref NEGEMMMatrixMultiplyKernel. Data type supported: the same as @p input. + * @param[in] beta Weight of matrix C + */ + void configure(const ITensor *input, ITensor *output, float beta); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + /** Common signature for all the matrix addition functions + * + * @param[in] input An input tensor. Data types supported: QS8/F16/F32 + * @param[out] output The output tensor. Data type supported: same as @p input + * @param[in] window Region on which to execute the kernel. 
+     * @param[in]  beta   Weight of matrix C
+     */
+    using MatrixAdditionFunction = void(const ITensor *input, ITensor *output, const Window &window, float beta);
+    /** Matrix addition function to use for the particular tensor types passed to configure() */
+    MatrixAdditionFunction *_func;
+    float                   _beta;
+};
+}
+#endif /* __ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
new file mode 100644
index 0000000000..a684945828
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H__
+#define __ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to multiply two input matrices "A" and "B". All elements of the output matrix/vector will be multiplied by alpha after the matrix multiplication
+ *
+ * @note If the output tensor is a matrix, the implementation assumes that the input tensors @p input0 and @p input1 are both matrices and reshaped respectively with @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel
+ * @note If the output tensor is a vector and the data type is F32, the implementation assumes that the first input tensor @p input0 is a vector and the second input tensor @p input1 a matrix. The implementation also assumes that both tensors have not been reshaped
+ *
+ */
+class NEGEMMMatrixMultiplyKernel : public INEKernel
+{
+public:
+    /** Constructor */
+    NEGEMMMatrixMultiplyKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMMatrixMultiplyKernel(const NEGEMMMatrixMultiplyKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMMatrixMultiplyKernel &operator=(const NEGEMMMatrixMultiplyKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGEMMMatrixMultiplyKernel(NEGEMMMatrixMultiplyKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGEMMMatrixMultiplyKernel &operator=(NEGEMMMatrixMultiplyKernel &&) = default;
+    /** Initialise the kernel's input and output.
+     *
+     * @note If the output tensor is a matrix, the input matrices @p input0 and @p input1 should be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel
+     *       These two kernels change the layout of the original matrices to be more cache-friendly.
+     *
+     * @param[in]  input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32
+     * @param[in]  input1 Input tensor containing the transposed Matrix B if the first input tensor A is not a vector.
+     *                    If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0
+     * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
+     * @param[in]  alpha  Weight of the matrix product
+     */
+    void configure(const ITensor *input0, const ITensor *input1, ITensor *output, float alpha);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    const ITensor *_input0;
+    const ITensor *_input1;
+    ITensor       *_output;
+    float          _alpha;
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H__*/
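Taken together, the GEMM kernels above cooperate as sketched below to compute D = alpha * A * B + beta * C, mirroring the kernel docs: interleave A, transpose B, multiply with alpha, then finalize with the matrix addition when beta != 0. This is a configuration sketch, not the library's NEGEMM function; the helper name and tensor arguments are assumptions, and allocation/shapes are left to the caller:

    #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
    #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
    #include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
    #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"

    using namespace arm_compute;

    // Wire up the four kernels for D = alpha * A * B + beta * C. The reshaped
    // tensors a_interleaved and b_transposed must be shaped as the kernel docs
    // describe; the kernels outlive this call and are run later by a scheduler.
    void gemm_configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d,
                        ITensor *a_interleaved, ITensor *b_transposed, float alpha, float beta,
                        NEGEMMInterleave4x4Kernel &interleave, NEGEMMTranspose1xWKernel &transpose,
                        NEGEMMMatrixMultiplyKernel &multiply, NEGEMMMatrixAdditionKernel &add)
    {
        interleave.configure(a, a_interleaved); // reshape A into 4x4-interleaved form
        transpose.configure(b, b_transposed);   // reshape B into 1xW-transposed form
        multiply.configure(a_interleaved, b_transposed, d, alpha);
        if(beta != 0.f)
        {
            add.configure(c, d, beta);          // d += beta * c, in place on d
        }
    }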
diff --git a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
new file mode 100644
index 0000000000..5d8a3697cb
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H__
+#define __ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel which transposes the elements of a matrix in chunks of 1xW, where W is equal to (16 / element size of the tensor)
+ *
+ * The following example shows how the transposition 1xW works when the input data type is F32
+ *
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccccccccccc}
+ * a00 & a01 & a02 & a03 & a10 & a11 & a12 & a13 & a20 & a21 & a22 & a23 & a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * @f]
+ *
+ * The following example shows how the transposition 1xW works when the input data type is F16
+ *
+ * @f[
+ * \left( \begin{array}{cccccccc}
+ * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 \\
+ * a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 \\
+ * a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 \\
+ * a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc}
+ * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 & a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 & a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 & a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37\\
+ * \end{array} \right)
+ * @f]
+ *
+ * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
+ *
+ */
+class NEGEMMTranspose1xWKernel : public INESimpleKernel
+{
+public:
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  input  Input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
+     * @param[out] output Output tensor. Data type supported: same as @p input.
+     */
+    void configure(const ITensor *input, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H__ */
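The 1xW transposition above can be expressed in scalar form for F32, where W = 16 / sizeof(float) = 4. An illustrative reference, not the NEON implementation:

    #include <cstddef>
    #include <vector>

    // Scalar reference of the 1xW transposition for F32 (W = 4): each 1x4 chunk
    // of a source row becomes a 4-element block placed along the destination row
    // selected by the chunk's column index. Output shape: [height * W, ceil(width / W)].
    std::vector<float> transpose1xW(const std::vector<float> &in, size_t width, size_t height)
    {
        const size_t W          = 4;
        const size_t out_width  = height * W;
        const size_t out_height = (width + W - 1) / W;
        std::vector<float> out(out_width * out_height, 0.f);
        for(size_t r = 0; r < height; ++r)
        {
            for(size_t c = 0; c < width; ++c)
            {
                const size_t out_row = c / W;
                const size_t out_col = r * W + c % W;
                out[out_row * out_width + out_col] = in[r * width + c];
            }
        }
        return out;
    }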
diff --git a/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h b/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h
new file mode 100644
index 0000000000..763fab88f6
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H__
+#define __ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform a Gaussian 3x3 filter */
+class NEGaussian3x3Kernel : public INESimpleKernel
+{
+public:
+    /** Set the source, destination and border mode of the kernel
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8
+     * @param[out] output           Destination tensor. Data type supported: same as @p input
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+};
+}
+#endif /*__ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h b/arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h
new file mode 100644
index 0000000000..86b28907da
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H__
+#define __ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform a Gaussian 5x5 filter (horizontal pass) */
+class NEGaussian5x5HorKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NEGaussian5x5HorKernel();
+
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output           Destination tensor. Data type supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    BorderSize _border_size;
+};
+
+/** NEON kernel to perform a Gaussian 5x5 filter (vertical pass) */
+class NEGaussian5x5VertKernel : public INESimpleKernel
+{
+public:
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: S16.
+     * @param[out] output           Destination tensor. Data type supported: U8.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+};
+}
+#endif /*__ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h b/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h
new file mode 100644
index 0000000000..40a6aa7375
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H__
+#define __ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform a GaussianPyramid (horizontal pass) */
+class NEGaussianPyramidHorKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NEGaussianPyramidHorKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGaussianPyramidHorKernel(NEGaussianPyramidHorKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGaussianPyramidHorKernel &operator=(NEGaussianPyramidHorKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGaussianPyramidHorKernel(NEGaussianPyramidHorKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGaussianPyramidHorKernel &operator=(NEGaussianPyramidHorKernel &&) = default;
+    /** Default destructor */
+    ~NEGaussianPyramidHorKernel() = default;
+
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output           Destination tensor. Data type supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    BorderSize _border_size;
+    int        _l2_load_offset;
+};
+
+/** NEON kernel to perform a GaussianPyramid (vertical pass) */
+class NEGaussianPyramidVertKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NEGaussianPyramidVertKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGaussianPyramidVertKernel(NEGaussianPyramidVertKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGaussianPyramidVertKernel &operator=(NEGaussianPyramidVertKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGaussianPyramidVertKernel(NEGaussianPyramidVertKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGaussianPyramidVertKernel &operator=(NEGaussianPyramidVertKernel &&) = default;
+    /** Default destructor */
+    ~NEGaussianPyramidVertKernel() = default;
+
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: S16.
+     * @param[out] output           Destination tensor. Data type supported: U8.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    int _t2_load_offset;
+};
+}
+#endif /*__ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H__ */
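A wiring sketch for the separable 5x5 Gaussian above: the horizontal pass widens U8 pixels to S16 intermediates and the vertical pass reduces them back to U8. The helper name and the intermediate tensor are assumptions for illustration; allocation is left to the caller:

    #include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h"

    using namespace arm_compute;

    // Chain the two separable passes; tmp_s16 holds the S16 intermediates.
    void gaussian5x5_configure(const ITensor *src, ITensor *tmp_s16, ITensor *dst,
                               NEGaussian5x5HorKernel &hor, NEGaussian5x5VertKernel &vert,
                               bool border_undefined)
    {
        hor.configure(src, tmp_s16, border_undefined);  // U8 -> S16
        vert.configure(tmp_s16, dst, border_undefined); // S16 -> U8
    }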
diff --git a/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h b/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h
new file mode 100644
index 0000000000..dd85778b8a
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHOGDESCRIPTORKERNEL_H__
+#define __ARM_COMPUTE_NEHOGDESCRIPTORKERNEL_H__
+
+#include "arm_compute/core/IHOG.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Size2D.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform HOG Orientation Binning */
+class NEHOGOrientationBinningKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEHOGOrientationBinningKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHOGOrientationBinningKernel(const NEHOGOrientationBinningKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHOGOrientationBinningKernel &operator=(const NEHOGOrientationBinningKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEHOGOrientationBinningKernel(NEHOGOrientationBinningKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEHOGOrientationBinningKernel &operator=(NEHOGOrientationBinningKernel &&) = default;
+    /** Default destructor */
+    ~NEHOGOrientationBinningKernel() = default;
+
+    /** Initialise the kernel's inputs, output and HOG's metadata
+     *
+     * @param[in]  input_magnitude Input tensor which stores the magnitude of the gradient for each pixel. Data type supported: S16.
+     * @param[in]  input_phase     Input tensor which stores the phase of the gradient for each pixel. Data type supported: U8
+     * @param[out] output          Output tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
+     * @param[in]  hog_info        HOG's metadata
+     */
+    void configure(const ITensor *input_magnitude, const ITensor *input_phase, ITensor *output, const HOGInfo *hog_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Common signature for all the specialised orientation binning functions
+     *
+     * @param[in]  mag_row_ptr   Pointer to the first row of the cell in the magnitude tensor
+     * @param[in]  phase_row_ptr Pointer to the first row of the cell in the phase tensor
+     * @param[out] output_ptr    Pointer to the output cell of the hog space tensor
+     * @param[in]  mag_stride    Stride of the magnitude tensor
+     * @param[in]  phase_stride  Stride of the phase tensor
+     * @param[in]  cell_width    Width of the cell
+     * @param[in]  cell_height   Height of the cell
+     * @param[in]  num_bins      Number of bins for each cell
+     * @param[in]  phase_scale   Scale factor to apply to the phase in order to calculate the histogram index
+     */
+    using OrientBinFunc = void(const int16_t *__restrict mag_row_ptr, const uint8_t *__restrict phase_row_ptr, float *__restrict output_ptr, size_t mag_stride, size_t phase_stride, size_t cell_width,
+                               size_t cell_height, size_t num_bins, float phase_scale);
+    /** Orientation binning function to use for the particular cell width passed to configure() */
+    OrientBinFunc *_func;
+    const ITensor *_input_magnitude;
+    const ITensor *_input_phase;
+    ITensor       *_output;
+    size_t         _cell_width;
+    size_t         _cell_height;
+    size_t         _num_bins;
+    float          _phase_scale;
+};
+
+/** NEON kernel to perform HOG block normalization */
+class NEHOGBlockNormalizationKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEHOGBlockNormalizationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHOGBlockNormalizationKernel(const NEHOGBlockNormalizationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHOGBlockNormalizationKernel &operator=(const NEHOGBlockNormalizationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEHOGBlockNormalizationKernel(NEHOGBlockNormalizationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEHOGBlockNormalizationKernel &operator=(NEHOGBlockNormalizationKernel &&) = default;
+    /** Default destructor */
+    ~NEHOGBlockNormalizationKernel() = default;
+
+    /** Initialise the kernel's input, output and HOG's metadata
+     *
+     * @param[in]  input    Input tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
+     * @param[out] output   Output tensor which stores the normalised blocks. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
+     * @param[in]  hog_info HOG's metadata
+     */
+    void configure(const ITensor *input, ITensor *output, const HOGInfo *hog_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Common signature for all the specialised block normalization functions
+     *
+     * @param[in]  input_row_ptr              Pointer to the first row of the block in the input hog space tensor
+     * @param[out] output_ptr                 Pointer to the output block of the hog normalized space
+     * @param[in]  input_stride               Stride of the input hog space tensor
+     * @param[in]  num_cells_per_block_height Number of cells per block along the Y direction
+     * @param[in]  num_bins_block_x           Number of bins per block along the X direction
+     * @param[in]  num_bins_block             Number of total bins per block
+     * @param[in]  l2_hyst_threshold          Threshold to use for l2 hysteresis normalization
+     */
+    using BlockNormFunc = void(const float *input_row_ptr, float *output_ptr, size_t input_stride, size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block,
+                               float l2_hyst_threshold);
+    /** Block normalization function to use for the particular normalization type passed to configure() */
+    BlockNormFunc *_func;
+    const ITensor *_input;
+    ITensor       *_output;
+    Size2D         _num_cells_per_block;
+    Size2D         _num_cells_per_block_stride;
+    size_t         _num_bins;
+    float          _l2_hyst_threshold;
+};
+}
+#endif /* __ARM_COMPUTE_NEHOGDESCRIPTORKERNEL_H__ */
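The orientation binning step documented above can be modelled in scalar form: each pixel votes into the histogram bin selected by its scaled phase, weighted by its magnitude. Bin interpolation, which HOG implementations often apply, is omitted, and the function is illustrative only:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Scalar sketch of orientation binning for one cell: the bin index comes
    // from the phase scaled by phase_scale, and the vote is the magnitude.
    void orient_bin_cell(const int16_t *mag, const uint8_t *phase, size_t stride,
                         size_t cell_w, size_t cell_h, size_t num_bins, float phase_scale,
                         std::vector<float> &hist)
    {
        hist.assign(num_bins, 0.f);
        for(size_t y = 0; y < cell_h; ++y)
        {
            for(size_t x = 0; x < cell_w; ++x)
            {
                const size_t bin = static_cast<size_t>(phase[y * stride + x] * phase_scale) % num_bins;
                hist[bin] += static_cast<float>(mag[y * stride + x]);
            }
        }
    }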
diff --git a/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h b/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h
new file mode 100644
index 0000000000..e56d1e5fd8
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHOGDETECTORKERNEL_H__
+#define __ARM_COMPUTE_NEHOGDETECTORKERNEL_H__
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/IHOG.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <mutex>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform HOG detection using linear SVM */
+class NEHOGDetectorKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEHOGDetectorKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHOGDetectorKernel(const NEHOGDetectorKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHOGDetectorKernel &operator=(const NEHOGDetectorKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEHOGDetectorKernel(NEHOGDetectorKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEHOGDetectorKernel &operator=(NEHOGDetectorKernel &&) = default;
+    /** Default destructor */
+    ~NEHOGDetectorKernel() = default;
+
+    /** Initialise the kernel's input, HOG data-object, detection window, the stride of the detection window, the threshold and index of the object to detect
+     *
+     * @param[in]  input                   Input tensor which stores the HOG descriptor obtained with @ref NEHOGOrientationBinningKernel. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
+     * @param[in]  hog                     HOG data object used by @ref NEHOGOrientationBinningKernel and @ref NEHOGBlockNormalizationKernel
+     * @param[out] detection_windows       Array of @ref DetectionWindow. This array stores all the detected objects
+     * @param[in]  detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
+     *                                     It must be a multiple of the hog->info()->block_stride()
+     * @param[in]  threshold               (Optional) Threshold for the distance between features and SVM classifying plane
+     * @param[in]  idx_class               (Optional) Index of the class used for evaluating which class the detection window belongs to
+     */
+    void configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f, uint16_t idx_class = 0);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    const ITensor         *_input;
+    IDetectionWindowArray *_detection_windows;
+    const float           *_hog_descriptor;
+    float                  _bias;
+    float                  _threshold;
+    uint16_t               _idx_class;
+    size_t                 _num_bins_per_descriptor_x;
+    size_t                 _num_blocks_per_descriptor_y;
+    size_t                 _block_stride_width;
+    size_t                 _block_stride_height;
+    size_t                 _detection_window_width;
+    size_t                 _detection_window_height;
+    size_t                 _max_num_detection_windows;
+    std::mutex             _mutex;
+};
+}
+
+#endif /* __ARM_COMPUTE_NEHOGDETECTORKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
new file mode 100644
index 0000000000..0abd73ef97
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHARRISCORNERSKERNEL_H__
+#define __ARM_COMPUTE_NEHARRISCORNERSKERNEL_H__
+
+#include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h"
+#include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h"
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstdint>
+#include <mutex>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** Common interface for all Harris Score kernels */
+class INEHarrisScoreKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    INEHarrisScoreKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    INEHarrisScoreKernel(const INEHarrisScoreKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    INEHarrisScoreKernel &operator=(const INEHarrisScoreKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    INEHarrisScoreKernel(INEHarrisScoreKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    INEHarrisScoreKernel &operator=(INEHarrisScoreKernel &&) = default;
+    /** Default destructor */
+    ~INEHarrisScoreKernel() = default;
+
+public:
+    /** Setup the kernel parameters
+     *
+     * @param[in]  input1           Source image (gradient X). Data types supported: S16/S32
+     * @param[in]  input2           Source image (gradient Y). Data types supported: same as @p input1
+     * @param[out] output           Destination image (harris score). Data types supported: F32
+     * @param[in]  norm_factor      Normalization factor to use accordingly with the gradient size (Must be different from 0)
+     * @param[in]  strength_thresh  Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
+     * @param[in]  sensitivity      Sensitivity threshold k from the Harris-Stephens equation
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    virtual void configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity, bool border_undefined) = 0;
+
+protected:
+    const IImage *_input1;          /**< Source image - Gx component */
+    const IImage *_input2;          /**< Source image - Gy component */
+    IImage       *_output;          /**< Destination image - Harris score */
+    float         _sensitivity;     /**< Sensitivity value */
+    float         _strength_thresh; /**< Threshold value */
+    float         _norm_factor;     /**< Normalization factor */
+    BorderSize    _border_size;     /**< Border size */
+};
+
+/** Template NEON kernel to perform Harris Score.
+ *  The implementation supports 3, 5, and 7 for the block_size
+ */
+template <int32_t block_size>
+class NEHarrisScoreKernel : public INEHarrisScoreKernel
+{
+public:
+    /** Default constructor */
+    NEHarrisScoreKernel();
+    // Inherited methods overridden:
+    void configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity, bool border_undefined) override;
+    BorderSize border_size() const override;
+    void run(const Window &window) override;
+
+private:
+    /** Common signature for all the specialised harris score functions */
+    using HarrisScoreFunction = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
+                                     float norm_factor, float sensitivity, float strength_thresh);
+    /** Harris Score function to use for the particular image types passed to configure() */
+    HarrisScoreFunction *_func;
+};
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+/** Template NEON kernel to perform Harris Score using F16 */
+template <int32_t block_size>
+class NEHarrisScoreFP16Kernel : public INEHarrisScoreKernel
+{
+public:
+    /** Default constructor */
+    NEHarrisScoreFP16Kernel();
+    // Inherited methods overridden:
+    void configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity, bool border_undefined) override;
+    BorderSize border_size() const override;
+    void run(const Window &window) override;
+
+private:
+    using HarrisScoreFunction = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
+                                     float norm_factor, float sensitivity, float strength_thresh);
+    /** Harris Score function to use for the particular image types passed to configure() */
+    HarrisScoreFunction *_func;
+};
+#else
+template <int32_t block_size>
+using NEHarrisScoreFP16Kernel = NEHarrisScoreKernel<block_size>;
+#endif
+}
+#endif /* __ARM_COMPUTE_NEHARRISCORNERSKERNEL_H__ */
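The response computed by the kernels above follows the Harris-Stephens equation referenced in the configure() docs: score = det(M) - k * trace(M)^2, where M is built from block sums of the gradient products and k is the sensitivity. A scalar sketch, with the block accumulation and the norm_factor pre-scaling of the gradients left to the caller:

    // gx2 = sum of Gx*Gx, gy2 = sum of Gy*Gy, gxgy = sum of Gx*Gy over the block.
    // Scores below strength_thresh are eliminated, as the configure() docs state.
    float harris_score(float gx2, float gy2, float gxgy, float sensitivity, float strength_thresh)
    {
        const float det   = gx2 * gy2 - gxgy * gxgy;
        const float trace = gx2 + gy2;
        const float score = det - sensitivity * trace * trace;
        return (score > strength_thresh) ? score : 0.f;
    }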
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHISTOGRAMKERNEL_H__
+#define __ARM_COMPUTE_NEHISTOGRAMKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <mutex>
+
+namespace arm_compute
+{
+class IDistribution1D;
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the histogram kernel */
+class NEHistogramKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEHistogramKernel();
+ /** Default destructor */
+ ~NEHistogramKernel() = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEHistogramKernel(const NEHistogramKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEHistogramKernel &operator=(const NEHistogramKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEHistogramKernel(NEHistogramKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEHistogramKernel &operator=(NEHistogramKernel &&) = default;
+
+ /** Set the input image and the distribution output.
+ *
+ * @param[in] input Source image. Data type supported: U8.
+ * @param[out] output Destination distribution.
+ * @param[in,out] local_hist Array that the threads use to save their local histograms.
+ * Its size should be equal to (number_of_threads * num_bins),
+ * and the Window::thread_id() is used to determine the part of the array
+ * used by each thread.
+ * @param[out] window_lut LUT with pre-calculated possible window values.
+ * The size of the LUT should be equal to max_range_size and it will be filled
+ * during the configure stage, while it is re-used in every run and can therefore be
+ * safely shared among threads.
+ */
+ void configure(const IImage *input, IDistribution1D *output, uint32_t *local_hist, uint32_t *window_lut);
+ /** Set the input image and the distribution output.
+ *
+ * @note Used for histogram of fixed size equal to 256
+ *
+ * @param[in] input Source image. Data type supported: U8.
+ * @param[out] output Destination distribution, which must have 256 bins.
+ */
+ void configure(const IImage *input, IDistribution1D *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ /** Function to merge multiple partial histograms.
+ *
+ * @param[out] global_hist Pointer to the final histogram.
+ * @param[in] local_hist Pointer to the partial histograms.
+ * @param[in] bins Number of bins.
+ */
+ void merge_histogram(uint32_t *global_hist, const uint32_t *local_hist, size_t bins);
+ /** Function to merge multiple minimum values of partial histograms.
+ *
+ * @param[out] global_min Pointer to the global min value.
+ * @param[in] local_min Local min value.
+ */
+ void merge_min(uint8_t *global_min, const uint8_t &local_min);
+ /** Function to perform histogram on the given window
+ *
+ * @param[in] win Region on which to execute the kernel
+ */
+ void histogram_U8(Window win);
+ /** Function to perform histogram on the given window where histogram is
+ * of fixed size 256 without ranges and offsets.
+ *
+ * @param[in] win Region on which to execute the kernel
+ */
+ void histogram_fixed_U8(Window win);
+ /** Pre-calculate the pixel windowing for every possible pixel
+ *
+ * Calculate (V - offset) * numBins / range where V is every possible pixel value.
+ *
+ * @note We currently support U8 images, thus possible pixel values are between 0 and 255
+ */
+ void calculate_window_lut() const;
+ /** Common signature for all the specialised Histogram functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using HistogramFunctionPtr = void (NEHistogramKernel::*)(Window window);
+
+ HistogramFunctionPtr _func; ///< Histogram function to use for the particular image types passed to configure()
+ const IImage *_input;
+ IDistribution1D *_output;
+ uint32_t *_local_hist;
+ uint32_t *_window_lut;
+ std::mutex _hist_mtx;
+ static constexpr unsigned int _max_range_size{ 256 }; ///< 256 possible pixel values as we handle only U8 images
+};
+}
+#endif /*__ARM_COMPUTE_NEHISTOGRAMKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
new file mode 100644
index 0000000000..ebaafb467f
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEIM2COLKERNEL_H__
+#define __ARM_COMPUTE_NEIM2COLKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the im2col reshape kernel.
+ *
+ * Rearranges image blocks into columns. It is used to strip out each convolution block into a single column.
+ * It is used to transform a convolution to a plain matrix multiplication.
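A small sketch of the window LUT described in calculate_window_lut() above: every possible U8 value V maps to bin (V - offset) * num_bins / range. The clamping of values below the offset is an assumption of this sketch.

```cpp
#include <cstdint>
#include <vector>

// Illustrative equivalent of the pre-calculated window LUT (hypothetical helper).
std::vector<uint32_t> make_window_lut(uint32_t num_bins, uint32_t offset, uint32_t range)
{
    std::vector<uint32_t> lut(256); // U8 input: 256 possible pixel values
    for(uint32_t v = 0; v < 256; ++v)
    {
        lut[v] = (v < offset) ? 0 : (v - offset) * num_bins / range;
    }
    return lut;
}
```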
+ *
+ * For example, taking into account the image below and assuming 3x3 image blocks with a stride of 1, we have:
+ *
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccc}
+ * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\
+ * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\
+ * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\
+ * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * @f]
+ */
+class NEIm2ColKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEIm2ColKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEIm2ColKernel(const NEIm2ColKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEIm2ColKernel &operator=(const NEIm2ColKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEIm2ColKernel(NEIm2ColKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEIm2ColKernel &operator=(NEIm2ColKernel &&) = default;
+ /** Default destructor */
+ ~NEIm2ColKernel() = default;
+
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represents a batch of inputs. Data types supported: QS8/F32
+ * @param[out] output The output tensor. Data types supported: Same as @p input
+ * @param[in] convolved_dims The convolved output dimensions.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] has_bias In case biases are provided, expands the matrix with an extra element set to 1.
+ */
+ void configure(const ITensor *input, ITensor *output, std::pair<unsigned int, unsigned int> convolved_dims, const PadStrideInfo &conv_info, bool has_bias);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ /** Template function to run the im2col optimised for the fully connected layer case
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ */
+ template <typename T>
+ void run_reduced(const Window &window);
+ /** Template function to run the im2col used for the convolution layer case
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ */
+ template <typename T>
+ void run_generic(const Window &window);
+ /** Common signature for all the specialised im2col functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using Im2ColFunctionPtr = void (NEIm2ColKernel::*)(const Window &window);
+
+ Im2ColFunctionPtr _func;
+ const ITensor *_input;
+ ITensor *_output;
+ std::pair<unsigned int, unsigned int> _convolved_dims;
+ PadStrideInfo _conv_info;
+ unsigned int _kernel_size;
+ bool _has_bias;
+};
+}
+#endif /*__ARM_COMPUTE_NEIM2COLKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEIntegralImageKernel.h b/arm_compute/core/NEON/kernels/NEIntegralImageKernel.h
new file mode 100644
index 0000000000..13647889ab
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEIntegralImageKernel.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
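A scalar reference of the rearrangement shown in the im2col comment above (hypothetical helper, not the kernel itself): 3x3 blocks at stride 1 over a 4x4 single-channel image, one block per output row.

```cpp
#include <vector>

std::vector<std::vector<float>> im2col_3x3_stride1(const float img[4][4])
{
    std::vector<std::vector<float>> rows;
    for(int y = 0; y <= 1; ++y) // 4 - 3 = 1: two valid vertical offsets
    {
        for(int x = 0; x <= 1; ++x)
        {
            std::vector<float> row;
            for(int ky = 0; ky < 3; ++ky)
            {
                for(int kx = 0; kx < 3; ++kx)
                {
                    row.push_back(img[y + ky][x + kx]);
                }
            }
            rows.push_back(row); // 9 elements, matching one row of the matrix above
        }
    }
    return rows; // 4 rows of 9 elements
}
```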
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H__
+#define __ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Kernel to compute the integral image of an image */
+class NEIntegralImageKernel : public INESimpleKernel
+{
+public:
+ /** Set the source and destination of the kernel
+ *
+ * @param[in] input Source tensor. Data type supported: U8
+ * @param[out] output Destination tensor. Data type supported: U32
+ */
+ void configure(const ITensor *input, ITensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+ bool is_parallelisable() const override;
+};
+}
+#endif /*__ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NELKTrackerKernel.h b/arm_compute/core/NEON/kernels/NELKTrackerKernel.h
new file mode 100644
index 0000000000..9ab7f91092
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NELKTrackerKernel.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
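The recurrence behind NEIntegralImageKernel, as a scalar reference (illustrative only): each U32 output pixel holds the sum of all U8 input pixels above and to its left.

```cpp
#include <cstdint>

void integral_image_reference(const uint8_t *src, uint32_t *dst, int width, int height)
{
    for(int y = 0; y < height; ++y)
    {
        for(int x = 0; x < width; ++x)
        {
            const uint32_t left    = (x > 0) ? dst[y * width + (x - 1)] : 0;
            const uint32_t up      = (y > 0) ? dst[(y - 1) * width + x] : 0;
            const uint32_t up_left = (x > 0 && y > 0) ? dst[(y - 1) * width + (x - 1)] : 0;
            // I(x, y) = i(x, y) + I(x-1, y) + I(x, y-1) - I(x-1, y-1)
            dst[y * width + x] = src[y * width + x] + left + up - up_left;
        }
    }
}
```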
+ */
+#ifndef __ARM_COMPUTE_NELKTRACKERKERNEL_H__
+#define __ARM_COMPUTE_NELKTRACKERKERNEL_H__
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+#include <utility>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Internal keypoint class for Lucas-Kanade Optical Flow */
+struct NELKInternalKeypoint
+{
+ float x{ 0.f }; /**< x coordinate of the keypoint */
+ float y{ 0.f }; /**< y coordinate of the keypoint */
+ bool tracking_status{ false }; /**< the tracking status of the keypoint */
+};
+
+using INELKInternalKeypointArray = IArray<NELKInternalKeypoint>;
+
+/** Interface for the Lucas-Kanade tracker kernel */
+class NELKTrackerKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NELKTrackerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NELKTrackerKernel(const NELKTrackerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NELKTrackerKernel &operator=(const NELKTrackerKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NELKTrackerKernel(NELKTrackerKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NELKTrackerKernel &operator=(NELKTrackerKernel &&) = default;
+ /** Default destructor */
+ ~NELKTrackerKernel() = default;
+
+ /** Initialise the kernel input and output
+ *
+ * @param[in] input_old Pointer to the input old tensor. Data type supported: U8
+ * @param[in] input_new Pointer to the input new tensor. Data type supported: U8
+ * @param[in] old_scharr_gx Pointer to the input Scharr X tensor. Data type supported: S16
+ * @param[in] old_scharr_gy Pointer to the input Scharr Y tensor. Data type supported: S16
+ * @param[in] old_points Pointer to the IKeyPointArray storing old key points
+ * @param[in] new_points_estimates Pointer to the IKeyPointArray storing new estimated key points
+ * @param[out] new_points Pointer to the IKeyPointArray storing new key points
+ * @param[in, out] old_points_internal Pointer to the array of NELKInternalKeypoint for old points
+ * @param[out] new_points_internal Pointer to the array of NELKInternalKeypoint for new points
+ * @param[in] termination The criteria to terminate the search of each keypoint.
+ * @param[in] use_initial_estimate The flag to indicate whether the initial estimated position should be used
+ * @param[in] epsilon The error for terminating the algorithm
+ * @param[in] num_iterations The maximum number of iterations before terminating the algorithm
+ * @param[in] window_dimension The size of the window on which to perform the algorithm
+ * @param[in] level The pyramid level
+ * @param[in] num_levels The number of pyramid levels
+ * @param[in] pyramid_scale Scale factor used for generating the pyramid
+ */
+ void configure(const ITensor *input_old, const ITensor *input_new, const ITensor *old_scharr_gx, const ITensor *old_scharr_gy,
+ const IKeyPointArray *old_points, const IKeyPointArray *new_points_estimates, IKeyPointArray *new_points,
+ INELKInternalKeypointArray *old_points_internal, INELKInternalKeypointArray *new_points_internal,
+ Termination termination, bool use_initial_estimate, float epsilon, unsigned int num_iterations, size_t window_dimension,
+ size_t level, size_t num_levels, float pyramid_scale);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ /** Initialise the array of keypoints in the provided range
+ *
+ * @param[in] start Index of first element in the keypoints array to be initialised
+ * @param[in] end Index after last element in the keypoints array to be initialised
+ */
+ void init_keypoints(int start, int end);
+ /** Compute the structure tensor A^T * A based on the Scharr gradients I_x and I_y
+ *
+ * @param[in] keypoint Keypoint for which gradients are computed
+ * @param[out] bilinear_ix Intermediate interpolated data for X gradient
+ * @param[out] bilinear_iy Intermediate interpolated data for Y gradient
+ *
+ * @return Values A11, A12, A22
+ */
+ std::tuple<int, int, int> compute_spatial_gradient_matrix(const NELKInternalKeypoint &keypoint, int *bilinear_ix, int *bilinear_iy);
+ /** Compute the vector A^T * b, i.e. -sum(I_d * I_t) for d in {x,y}
+ *
+ * @param[in] old_keypoint Old keypoint for which gradient is computed
+ * @param[in] new_keypoint New keypoint for which gradient is computed
+ * @param[in] bilinear_ix Intermediate interpolated data for X gradient
+ * @param[in] bilinear_iy Intermediate interpolated data for Y gradient
+ *
+ * @return Values b1, b2
+ */
+ std::pair<int, int> compute_image_mismatch_vector(const NELKInternalKeypoint &old_keypoint, const NELKInternalKeypoint &new_keypoint, const int *bilinear_ix, const int *bilinear_iy);
+
+ const ITensor *_input_old;
+ const ITensor *_input_new;
+ const ITensor *_old_scharr_gx;
+ const ITensor *_old_scharr_gy;
+ IKeyPointArray *_new_points;
+ const IKeyPointArray *_new_points_estimates;
+ const IKeyPointArray *_old_points;
+ INELKInternalKeypointArray *_old_points_internal;
+ INELKInternalKeypointArray *_new_points_internal;
+ Termination _termination;
+ bool _use_initial_estimate;
+ float _pyramid_scale;
+ float _epsilon;
+ unsigned int _num_iterations;
+ int _window_dimension;
+ unsigned int _level;
+ unsigned int _num_levels;
+ ValidRegion _valid_region;
+};
+}
+#endif /*__ARM_COMPUTE_NELKTRACKERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h
new file mode 100644
index 0000000000..d4bff661f9
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
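For context (an illustrative sketch, not kernel code): given the structure tensor values A11, A12, A22 and the mismatch vector (b1, b2) returned by the two helpers documented above, each Lucas-Kanade iteration solves a 2x2 linear system for the displacement update.

```cpp
#include <utility>

std::pair<float, float> lk_displacement(float A11, float A12, float A22, float b1, float b2)
{
    const float det = A11 * A22 - A12 * A12;
    if(det == 0.f)
    {
        return { 0.f, 0.f }; // singular system: point is untrackable in this sketch
    }
    // d = A^-1 * b by Cramer's rule
    return { (A22 * b1 - A12 * b2) / det, (A11 * b2 - A12 * b1) / det };
}
```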
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H__
+#define __ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to multiply each row of the first tensor with the low 2 dimensions of the second tensor. */
+class NELocallyConnectedMatrixMultiplyKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NELocallyConnectedMatrixMultiplyKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NELocallyConnectedMatrixMultiplyKernel(const NELocallyConnectedMatrixMultiplyKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NELocallyConnectedMatrixMultiplyKernel &operator=(const NELocallyConnectedMatrixMultiplyKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NELocallyConnectedMatrixMultiplyKernel(NELocallyConnectedMatrixMultiplyKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NELocallyConnectedMatrixMultiplyKernel &operator=(NELocallyConnectedMatrixMultiplyKernel &&) = default;
+ /** Initialise the kernel's input and output
+ *
+ * @param[in] input0 First input tensor. Data types supported: F32
+ * @param[in] input1 Second input tensor containing the Matrix B. Data type supported: same as @p input0
+ * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
+ */
+ void configure(const ITensor *input0, const ITensor *input1, ITensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ const ITensor *_input0;
+ const ITensor *_input1;
+ ITensor *_output;
+};
+}
+#endif /* __ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h b/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h
new file mode 100644
index 0000000000..5d49901dd0
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
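A scalar sketch of the documented semantics (shapes and memory layout are assumptions): output row r is the product of row r of input0 with the r-th 2D slice of input1.

```cpp
// out[r][c] = sum_k in0[r][k] * in1[r][k][c]  (hypothetical row-major layout)
void locally_connected_matmul(const float *in0, const float *in1, float *out,
                              int rows, int K, int cols)
{
    for(int r = 0; r < rows; ++r)
    {
        for(int c = 0; c < cols; ++c)
        {
            float acc = 0.f;
            for(int k = 0; k < K; ++k)
            {
                acc += in0[r * K + k] * in1[(r * K + k) * cols + c];
            }
            out[r * cols + c] = acc;
        }
    }
}
```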
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H__
+#define __ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Template interface for the kernel to compute magnitude and phase */
+template <MagnitudeType mag_type, PhaseType phase_type>
+class NEMagnitudePhaseKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEMagnitudePhaseKernel();
+ /** Destructor */
+ ~NEMagnitudePhaseKernel() = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMagnitudePhaseKernel(const NEMagnitudePhaseKernel &) = delete;
+ /** Default move constructor */
+ NEMagnitudePhaseKernel(NEMagnitudePhaseKernel &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMagnitudePhaseKernel &operator=(const NEMagnitudePhaseKernel &) = delete;
+ /** Default move assignment operator */
+ NEMagnitudePhaseKernel &operator=(NEMagnitudePhaseKernel &&) = default;
+
+ /** Initialise the kernel's input, output.
+ *
+ * @note At least one of magnitude or phase must be set
+ *
+ * @param[in] gx Gradient X tensor. Data type supported: S16.
+ * @param[in] gy Gradient Y tensor. Data type supported: S16.
+ * @param[out] magnitude (Optional) The output tensor - Magnitude. Data type supported: S16.
+ * @param[out] phase (Optional) The output tensor - Phase. Data type supported: U8.
+ */
+ void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ /** Function to perform magnitude on the given window
+ *
+ * @param[in] window Region on which to execute the kernel
+ */
+ void magnitude(const Window &window);
+ /** Function to perform phase on the given window
+ *
+ * @param[in] window Region on which to execute the kernel
+ */
+ void phase(const Window &window);
+ /** Function to perform magnitude and phase on the given window
+ *
+ * @param[in] window Region on which to execute the kernel
+ */
+ void magnitude_phase(const Window &window);
+
+private:
+ /** Common signature for all the specialised MagnitudePhase functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using MagnitudePhaseFunctionPtr = void (NEMagnitudePhaseKernel::*)(const Window &window);
+ /** MagnitudePhase function to use for the particular formats passed to configure() */
+ MagnitudePhaseFunctionPtr _func;
+ const ITensor *_gx; /**< Input gradient X */
+ const ITensor *_gy; /**< Input gradient Y */
+ ITensor *_magnitude; /**< Output - Magnitude */
+ ITensor *_phase; /**< Output - Phase */
+};
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+/** Template interface for the kernel to compute magnitude and phase */
+template <MagnitudeType mag_type, PhaseType phase_type>
+class NEMagnitudePhaseFP16Kernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEMagnitudePhaseFP16Kernel();
+ /** Destructor */
+ ~NEMagnitudePhaseFP16Kernel() = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMagnitudePhaseFP16Kernel(const NEMagnitudePhaseFP16Kernel &) = delete;
+ /** Default move constructor */
+ NEMagnitudePhaseFP16Kernel(NEMagnitudePhaseFP16Kernel &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMagnitudePhaseFP16Kernel &operator=(const NEMagnitudePhaseFP16Kernel &) = delete;
+ /** Default move assignment operator */
+ NEMagnitudePhaseFP16Kernel &operator=(NEMagnitudePhaseFP16Kernel &&) = default;
+
+ /** Initialise the kernel's input, output.
+ *
+ * @note At least one of magnitude or phase must be set
+ *
+ * @param[in] gx Gradient X tensor. Data type supported: S16.
+ * @param[in] gy Gradient Y tensor. Data type supported: S16.
+ * @param[out] magnitude (Optional) The output tensor - Magnitude. Data type supported: S16.
+ * @param[out] phase (Optional) The output tensor - Phase. Data type supported: U8.
+ */
+ void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ /** Function to perform magnitude on the given window
+ *
+ * @param[in] window Region on which to execute the kernel
+ */
+ void magnitude(const Window &window);
+ /** Function to perform phase on the given window
+ *
+ * @param[in] window Region on which to execute the kernel
+ */
+ void phase(const Window &window);
+ /** Function to perform magnitude and phase on the given window
+ *
+ * @param[in] window Region on which to execute the kernel
+ */
+ void magnitude_phase(const Window &window);
+
+ /** Common signature for all the specialised MagnitudePhase functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using MagnitudePhaseFunctionPtr = void (NEMagnitudePhaseFP16Kernel::*)(const Window &window);
+ /** MagnitudePhase function to use for the particular formats passed to configure() */
+ MagnitudePhaseFunctionPtr _func;
+ const ITensor *_gx; /**< Input gradient X */
+ const ITensor *_gy; /**< Input gradient Y */
+ ITensor *_magnitude; /**< Output - Magnitude */
+ ITensor *_phase; /**< Output - Phase */
+};
+#else
+template <MagnitudeType mag_type, PhaseType phase_type>
+using NEMagnitudePhaseFP16Kernel = NEMagnitudePhaseKernel<mag_type, phase_type>;
+#endif
+}
+#endif /* __ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h b/arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h
new file mode 100644
index 0000000000..83407ccb7d
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
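The scalar definitions behind the kernel, for reference (illustrative; the NEON paths may round and quantise differently): L2 magnitude of the gradients and the gradient angle mapped onto U8.

```cpp
#include <cmath>
#include <cstdint>

int16_t magnitude_l2(int16_t gx, int16_t gy)
{
    return static_cast<int16_t>(std::lround(std::sqrt(static_cast<float>(gx) * gx + static_cast<float>(gy) * gy)));
}

uint8_t phase_u8(int16_t gx, int16_t gy)
{
    float angle = std::atan2(static_cast<float>(gy), static_cast<float>(gx)) * 180.f / 3.14159265f;
    if(angle < 0.f)
    {
        angle += 360.f; // map to [0, 360)
    }
    return static_cast<uint8_t>(angle * 255.f / 360.f); // quantisation is an assumption
}
```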
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMEANSTDDEVKERNEL_H__
+#define __ARM_COMPUTE_NEMEANSTDDEVKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstdint>
+#include <mutex>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the kernel to calculate mean and standard deviation of input image pixels. */
+class NEMeanStdDevKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEMeanStdDevKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMeanStdDevKernel(const NEMeanStdDevKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMeanStdDevKernel &operator=(const NEMeanStdDevKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEMeanStdDevKernel(NEMeanStdDevKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEMeanStdDevKernel &operator=(NEMeanStdDevKernel &&) = default;
+ /** Default destructor */
+ ~NEMeanStdDevKernel() = default;
+
+ /** Initialise the kernel's input and outputs.
+ *
+ * @param[in] input Input image. Data type supported: U8.
+ * @param[out] mean Output average pixel value.
+ * @param[out] global_sum Keeps global sum of pixel values.
+ * @param[out] stddev (Optional) Output standard deviation of pixel values.
+ * @param[out] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values.
+ */
+ void configure(const IImage *input, float *mean, uint64_t *global_sum, float *stddev = nullptr, uint64_t *global_sum_squared = nullptr);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ const IImage *_input;
+ float *_mean;
+ float *_stddev;
+ uint64_t *_global_sum;
+ uint64_t *_global_sum_squared;
+ std::mutex _mtx;
+};
+}
+#endif /* __ARM_COMPUTE_NEMEANSTDDEVKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h b/arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h
new file mode 100644
index 0000000000..dee1aadfb9
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
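The final reduction implied by the parameters above (illustrative): each thread accumulates into global_sum and global_sum_squared under the mutex; mean and standard deviation then follow from the usual identities.

```cpp
#include <cmath>
#include <cstdint>

void finalise_mean_stddev(uint64_t global_sum, uint64_t global_sum_squared,
                          uint64_t num_pixels, float *mean, float *stddev)
{
    *mean = static_cast<float>(global_sum) / num_pixels;
    if(stddev != nullptr)
    {
        const float mean_of_squares = static_cast<float>(global_sum_squared) / num_pixels;
        *stddev = std::sqrt(mean_of_squares - (*mean) * (*mean)); // Var(X) = E[X^2] - E[X]^2
    }
}
```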
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEMEDIAN3x3KERNEL_H__ +#define __ARM_COMPUTE_NEMEDIAN3x3KERNEL_H__ + +#include "arm_compute/core/NEON/INESimpleKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Kernel to perform a median filter on a tensor */ +class NEMedian3x3Kernel : public INESimpleKernel +{ +public: + /** Set the source, destination and border mode of the kernel + * + * @param[in] input Source tensor. Data type supported: U8 + * @param[out] output Destination tensor. Data type supported: U8 + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ITensor *input, ITensor *output, bool border_undefined); + + // Inherited methods overridden: + void run(const Window &window) override; + BorderSize border_size() const override; +}; +} +#endif /*__ARM_COMPUTE_NEMEDIAN3x3KERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h b/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h new file mode 100644 index 0000000000..e405ea5ae4 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
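Not part of the patch: a minimal usage sketch for the median kernel, assuming the usual Tensor allocation flow and the runtime scheduler; names outside this header come from the library's runtime API.

```cpp
#include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void median_sketch()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));
    dst.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));
    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src ...

    NEMedian3x3Kernel median;
    median.configure(&src, &dst, true /* border_undefined */);
    NEScheduler::get().schedule(&median, Window::DimY); // split the work across rows
}
```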
+ */
+#ifndef __ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H__
+#define __ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H__
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstdint>
+#include <mutex>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the kernel to perform min max search on an image. */
+class NEMinMaxKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEMinMaxKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMinMaxKernel(const NEMinMaxKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMinMaxKernel &operator=(const NEMinMaxKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEMinMaxKernel(NEMinMaxKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEMinMaxKernel &operator=(NEMinMaxKernel &&) = default;
+ /** Default destructor */
+ ~NEMinMaxKernel() = default;
+
+ /** Initialise the kernel's input and outputs.
+ *
+ * @param[in] input Input image. Data types supported: U8/S16.
+ * @param[out] min Minimum value of image.
+ * @param[out] max Maximum value of image.
+ */
+ void configure(const IImage *input, int32_t *min, int32_t *max);
+ /** Resets global minimum and maximum. */
+ void reset();
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ /** Performs the min/max algorithm on U8 images on a given window.
+ *
+ * @param win The window to run the algorithm on.
+ */
+ void minmax_U8(const Window &win);
+ /** Performs the min/max algorithm on S16 images on a given window.
+ *
+ * @param win The window to run the algorithm on.
+ */
+ void minmax_S16(const Window &win);
+ /** Common signature for all the specialised MinMax functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using MinMaxFunction = void (NEMinMaxKernel::*)(const Window &window);
+ /** MinMax function to use for the particular image types passed to configure() */
+ MinMaxFunction _func;
+ /** Helper to update min/max values */
+ template <typename T>
+ void update_min_max(T min, T max);
+
+ const IImage *_input; /**< Input image. */
+ int32_t *_min; /**< Minimum value. */
+ int32_t *_max; /**< Maximum value. */
+ int32_t _min_init; /**< Value to initialise global minimum value. */
+ int32_t _max_init; /**< Value to initialise global maximum value. */
+ std::mutex _mtx; /**< Mutex used for result reduction. */
+};
+
+/** Interface for the kernel to find min max locations of an image. */
+class NEMinMaxLocationKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEMinMaxLocationKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMinMaxLocationKernel(const NEMinMaxLocationKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMinMaxLocationKernel &operator=(const NEMinMaxLocationKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEMinMaxLocationKernel(NEMinMaxLocationKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEMinMaxLocationKernel &operator=(NEMinMaxLocationKernel &&) = default;
+ /** Default destructor */
+ ~NEMinMaxLocationKernel() = default;
+
+ /** Initialise the kernel's input and outputs.
+ *
+ * @param[in] input Input image. Data types supported: U8/S16.
+ * @param[out] min Minimum value of image.
+ * @param[out] max Maximum value of image.
+ * @param[out] min_loc Array of minimum value locations.
+ * @param[out] max_loc Array of maximum value locations.
+ * @param[out] min_count Number of minimum value encounters.
+ * @param[out] max_count Number of maximum value encounters.
+ */
+ void configure(const IImage *input, int32_t *min, int32_t *max,
+ ICoordinates2DArray *min_loc = nullptr, ICoordinates2DArray *max_loc = nullptr,
+ uint32_t *min_count = nullptr, uint32_t *max_count = nullptr);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ bool is_parallelisable() const override;
+
+private:
+ /** Performs the min/max location algorithm on T type images on a given window.
+ *
+ * @param win The window to run the algorithm on.
+ */
+ template <class T, bool count_min, bool count_max, bool loc_min, bool loc_max>
+ void minmax_loc(const Window &win);
+ /** Common signature for all the specialised MinMaxLoc functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using MinMaxLocFunction = void (NEMinMaxLocationKernel::*)(const Window &window);
+ /** MinMaxLoc function to use for the particular image types passed to configure() */
+ MinMaxLocFunction _func;
+ /** Helper to create a function pointer table for the parameterized MinMaxLocation functions. */
+ template <class T>
+ struct create_func_table;
+
+ const IImage *_input; /**< Input image. */
+ int32_t *_min; /**< Minimum value. */
+ int32_t *_max; /**< Maximum value. */
+ uint32_t *_min_count; /**< Count of minimum value encounters. */
+ uint32_t *_max_count; /**< Count of maximum value encounters. */
+ ICoordinates2DArray *_min_loc; /**< Locations of minimum values. */
+ ICoordinates2DArray *_max_loc; /**< Locations of maximum values. */
+ unsigned int _num_elems_processed_per_iteration; /**< Elements processed per iteration. */
+};
+}
+#endif /*__ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h b/arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h
new file mode 100644
index 0000000000..ede0294a73
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
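Not from the patch: a sketch of how the two kernels above are meant to be chained; NEMinMaxKernel finds the values, NEMinMaxLocationKernel then collects their locations into user-provided arrays. The array type and capacities are assumptions based on the runtime Array helpers.

```cpp
#include "arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h"
#include "arm_compute/runtime/Array.h"

using namespace arm_compute;

void minmax_location_sketch(const IImage *input)
{
    int32_t  min = 0, max = 0;
    uint32_t min_count = 0, max_count = 0;
    Coordinates2DArray min_loc(1000), max_loc(1000); // capacities are illustrative

    NEMinMaxKernel minmax;
    minmax.configure(input, &min, &max);

    NEMinMaxLocationKernel minmax_loc;
    minmax_loc.configure(input, &min, &max, &min_loc, &max_loc, &min_count, &max_count);
    // Run minmax first (e.g. through NEScheduler), then minmax_loc.
}
```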
+ */
+#ifndef __ARM_COMPUTE_NENONLINEARFILTERKERNEL_H__
+#define __ARM_COMPUTE_NENONLINEARFILTERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to apply a non-linear filter */
+class NENonLinearFilterKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NENonLinearFilterKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NENonLinearFilterKernel(NENonLinearFilterKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NENonLinearFilterKernel &operator=(NENonLinearFilterKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NENonLinearFilterKernel(NENonLinearFilterKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NENonLinearFilterKernel &operator=(NENonLinearFilterKernel &&) = default;
+ /** Set the source, destination and border mode of the kernel
+ *
+ * @param[in] input Source tensor. Data type supported: U8
+ * @param[out] output Destination tensor. Data type supported: U8
+ * @param[in] function Non linear function to perform
+ * @param[in] mask_size Mask size. Supported sizes: 3, 5
+ * @param[in] pattern Mask pattern
+ * @param[in] mask The given mask. Will be used only if pattern is set to PATTERN_OTHER
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ /** Fill mask with the corresponding given pattern.
+ *
+ * @param[in,out] mask Mask to be filled according to pattern
+ * @param[in] cols Columns (width) of mask
+ * @param[in] rows Rows (height) of mask
+ * @param[in] pattern Pattern to fill the mask according to
+ */
+ void fill_mask(uint8_t *mask, int cols, int rows, MatrixPattern pattern);
+ /** Apply a median filter when the given mask pattern is defined as box.
+ *
+ * @param[in] win Window to apply the filter on.
+ */
+ template <int mask_w, int mask_h>
+ void median_filter_box(const Window &win);
+ /** Apply a min filter when the given mask pattern is defined as box.
+ *
+ * @param[in] win Window to apply the filter on.
+ */
+ template <int mask_w, int mask_h>
+ void min_filter_box(const Window &win);
+ /** Apply a max filter when the given mask pattern is defined as box.
+ *
+ * @param[in] win Window to apply the filter on.
+ */
+ template <int mask_w, int mask_h>
+ void max_filter_box(const Window &win);
+ /** Apply a median filter when the given mask pattern is defined as cross.
+ *
+ * @param[in] win Window to apply the filter on.
+ */
+ template <int mask_w, int mask_h>
+ void median_filter_cross(const Window &win);
+ /** Apply a min filter when the given mask pattern is defined as cross.
+ *
+ * @param[in] win Window to apply the filter on.
+ */
+ template <int mask_w, int mask_h>
+ void min_filter_cross(const Window &win);
+ /** Apply a max filter when the given mask pattern is defined as cross.
+ *
+ * @param[in] win Window to apply the filter on.
+ */
+ template <int mask_w, int mask_h>
+ void max_filter_cross(const Window &win);
+ /** Apply a median filter when the given mask pattern is defined as disk.
+ *
+ * @param[in] win Window to apply the filter on.
+ */
+ template <int mask_w, int mask_h>
+ void median_filter_disk(const Window &win);
+ /** Apply a min filter when the given mask pattern is defined as disk.
+ *
+ * @param[in] win Window to apply the filter on.
+ */
+ template <int mask_w, int mask_h>
+ void min_filter_disk(const Window &win);
+ /** Apply a max filter when the given mask pattern is defined as disk.
+ *
+ * @param[in] win Window to apply the filter on.
+ */
+ template <int mask_w, int mask_h>
+ void max_filter_disk(const Window &win);
+ /** Apply a non-linear filter when the given mask has a user-defined pattern.
+ *
+ * @param[in] win Window to apply the filter on.
+ */
+ template <int mask_w, int mask_h>
+ void non_linear_filter_generic(const Window &win);
+
+private:
+ unsigned int _border_width;
+ const ITensor *_input;
+ ITensor *_output;
+ const uint8_t *_mask;
+ MatrixPattern _pattern;
+ NonLinearFilterFunction _function;
+ unsigned int _func_idx;
+ BorderSize _border_size;
+};
+}
+#endif /*__ARM_COMPUTE_NENONLINEARFILTERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h b/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
new file mode 100644
index 0000000000..0daae59e54
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
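For intuition, an illustrative 5x5 cross mask of the kind fill_mask() produces for the CROSS pattern; the 255/0 encoding of participating pixels is an assumption of this sketch.

```cpp
#include <cstdint>

const uint8_t cross_mask_5x5[5 * 5] = {
    0,   0,   255, 0,   0,
    0,   0,   255, 0,   0,
    255, 255, 255, 255, 255,
    0,   0,   255, 0,   0,
    0,   0,   255, 0,   0,
};
```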
+ */
+#ifndef __ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H__
+#define __ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface to perform Non-Maxima suppression over a 3x3 window using NEON
+ *
+ * @note Used by @ref NEFastCorners and @ref NEHarrisCorners
+ */
+class NENonMaximaSuppression3x3Kernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NENonMaximaSuppression3x3Kernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NENonMaximaSuppression3x3Kernel(const NENonMaximaSuppression3x3Kernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NENonMaximaSuppression3x3Kernel &operator=(const NENonMaximaSuppression3x3Kernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NENonMaximaSuppression3x3Kernel(NENonMaximaSuppression3x3Kernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NENonMaximaSuppression3x3Kernel &operator=(NENonMaximaSuppression3x3Kernel &&) = default;
+ /** Default destructor */
+ ~NENonMaximaSuppression3x3Kernel() = default;
+
+ /** Initialise the kernel's sources, destinations and border mode.
+ *
+ * @param[in] input Source tensor. Data types supported: U8/F32
+ * @param[out] output Destination tensor. Data types supported: same as @p input
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *input, ITensor *output, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+protected:
+ /** Common signature for all the specialised non-maxima suppression 3x3 functions
+ *
+ * @param[in] input_ptr Pointer to the input tensor.
+ * @param[out] output_ptr Pointer to the output tensor
+ * @param[in] input_stride Stride of the input tensor
+ */
+ using NonMaxSuppr3x3Function = void(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride);
+
+ NonMaxSuppr3x3Function *_func; /**< Non-Maxima suppression function to use for the particular tensor types passed to configure() */
+ const ITensor *_input; /**< Source tensor */
+ ITensor *_output; /**< Destination tensor */
+};
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+/** NEON kernel to perform Non-Maxima suppression 3x3 with intermediate results in F16 if the input data type is F32
+ */
+class NENonMaximaSuppression3x3FP16Kernel : public NENonMaximaSuppression3x3Kernel
+{
+public:
+ /** Initialise the kernel's sources, destinations and border mode.
+ *
+ * @param[in] input Source tensor. Data types supported: U8/F32.
+ * @param[out] output Destination tensor. Data types supported: same as @p input
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *input, ITensor *output, bool border_undefined);
+};
+#else
+using NENonMaximaSuppression3x3FP16Kernel = NENonMaximaSuppression3x3Kernel;
+#endif
+}
+#endif /* __ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
new file mode 100644
index 0000000000..d4e36d5ff1
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
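A scalar reference of the suppression rule (illustrative; tie handling in the NEON code may differ): a pixel survives only if no 8-neighbour exceeds it.

```cpp
float nms3x3_at(const float *in, int stride, int x, int y)
{
    const float centre = in[y * stride + x];
    for(int dy = -1; dy <= 1; ++dy)
    {
        for(int dx = -1; dx <= 1; ++dx)
        {
            if((dx != 0 || dy != 0) && in[(y + dy) * stride + (x + dx)] > centre)
            {
                return 0.f; // suppressed
            }
        }
    }
    return centre;
}
```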
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the normalization layer kernel.
+ */
+class NENormalizationLayerKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NENormalizationLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NENormalizationLayerKernel(const NENormalizationLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NENormalizationLayerKernel &operator=(const NENormalizationLayerKernel &) = delete;
+ /** Default move constructor. */
+ NENormalizationLayerKernel(NENormalizationLayerKernel &&) = default;
+ /** Default move assignment operator. */
+ NENormalizationLayerKernel &operator=(NENormalizationLayerKernel &&) = default;
+ /** Default destructor */
+ ~NENormalizationLayerKernel() = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
+ * and an optional 4th dimension for a batch of inputs. Data types supported: QS8/F32.
+ * @param[in] input_squared Source with each element squared. 3 lower dims represent a single input with dimensions [width, height, IFM],
+ * Data type supported: same as @p input
+ * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
+ * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
+ */
+ void configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ /** Function to perform normalization depending on the given template
+ * dimension. The second template parameter specifies whether the
+ * normalization has to be 1D or 2D.
+ *
+ * @note Only supported normalizations are:
+ * - 1D over X or Z
+ * - 2D over X and Y
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ template <unsigned int dim, bool do_2D_norm>
+ void normalize(const Window &window);
+
+ /** Function to perform normalization for fixed-point values depending on
+ * the given template dimension. The second template parameter specifies
+ * whether the normalization has to be 1D or 2D.
+ *
+ * @note Only supported normalizations are:
+ * - 1D over X or Z
+ * - 2D over X and Y
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ template <unsigned int dim, bool do_2D_norm>
+ void normalize_fixed_point(const Window &window);
+ /** Common signature for all the specialised normalization functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using NormalizationFunction = void (NENormalizationLayerKernel::*)(const Window &window);
+
+private:
+ NormalizationFunction _func;
+ const ITensor *_input;
+ const ITensor *_input_squared;
+ ITensor *_output;
+ NormalizationLayerInfo _norm_info;
+ BorderSize _border_size;
+};
+}
+#endif /*__ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
new file mode 100644
index 0000000000..7e402cd220
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
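For reference, the scalar form of the normalization this kernel performs follows the standard local response normalization definition (the exact coefficient handling comes from NormalizationLayerInfo and is assumed here):

```cpp
#include <cmath>

// out = in / (kappa + (alpha / n) * sum_sq)^beta, with sum_sq the sum of
// squares over the n-element window (1D or 2D, per the template above).
float normalize_one(float in, float window_sum_of_squares, int n,
                    float kappa, float alpha, float beta)
{
    return in / std::pow(kappa + (alpha / n) * window_sum_of_squares, beta);
}
```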
+ */
+#ifndef __ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H__
+#define __ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform multiplication between two tensors */
+class NEPixelWiseMultiplicationKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEPixelWiseMultiplicationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPixelWiseMultiplicationKernel(const NEPixelWiseMultiplicationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPixelWiseMultiplicationKernel &operator=(const NEPixelWiseMultiplicationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEPixelWiseMultiplicationKernel(NEPixelWiseMultiplicationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEPixelWiseMultiplicationKernel &operator=(NEPixelWiseMultiplicationKernel &&) = default;
+    /** Default destructor */
+    ~NEPixelWiseMultiplicationKernel() = default;
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
+     *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
+     *
+     * @param[in]  input1          An input tensor. Data types supported: U8/QS8/S16/F32.
+     * @param[in]  input2          An input tensor. Data types supported: U8/QS8/S16/F32.
+     * @param[out] output          The output tensor. Data types supported: U8 (only if both inputs are U8)/S16/F32.
+     * @param[in]  scale           Scale to apply after multiplication.
+     *                             Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+     * @param[in]  overflow_policy Overflow policy.
+     * @param[in]  rounding_policy Rounding policy.
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Common signature for all the specialised multiplication functions with integer scaling factor
+     *
+     * @param[in]  input1_ptr Pointer to the first input tensor.
+     * @param[in]  input2_ptr Pointer to the second input tensor.
+     * @param[in]  scale      Scaling factor.
+     * @param[out] output_ptr Pointer to the output tensor.
+     */
+    using MulFunctionInt = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int scale);
+    /** Common signature for all the specialised multiplication functions with fixed-point values
+     *
+     * @param[in]  input1_ptr           Pointer to the first input tensor.
+     * @param[in]  input2_ptr           Pointer to the second input tensor.
+     * @param[in]  scale                Scaling factor.
+     * @param[in]  fixed_point_position Fixed-point position that expresses the number of bits for the fractional part of the number.
+     * @param[out] output_ptr           Pointer to the output tensor.
+     */
+    using MulFunctionQInt = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int scale, int fixed_point_position);
+    /** Common signature for all the specialised multiplication functions with float scaling factor
+     *
+     * @param[in]  input1_ptr Pointer to the first input tensor.
+     * @param[in]  input2_ptr Pointer to the second input tensor.
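+     * @param[in]  scale      Scaling factor.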
+ * @param[out] output_ptr Pointer to the output tensor. + */ + using MulFunctionFloat = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale); + + MulFunctionFloat *_func_float; + MulFunctionInt *_func_int; + MulFunctionQInt *_func_q_int; + +private: + const ITensor *_input1; + const ITensor *_input2; + ITensor *_output; + float _scale; + int _scale_exponent; +}; +} +#endif /*__ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h new file mode 100644 index 0000000000..62a087841a --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H__ +#define __ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the pooling layer kernel */ +class NEPoolingLayerKernel : public INEKernel +{ +public: + /** Default constructor */ + NEPoolingLayerKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEPoolingLayerKernel(const NEPoolingLayerKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEPoolingLayerKernel &operator=(const NEPoolingLayerKernel &) = delete; + /** Allow instances of this class to be moved */ + NEPoolingLayerKernel(NEPoolingLayerKernel &&) = default; + /** Allow instances of this class to be moved */ + NEPoolingLayerKernel &operator=(NEPoolingLayerKernel &&) = default; + /** Default destructor */ + ~NEPoolingLayerKernel() = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data types supported: QS8/F32. + * @param[out] output Destination tensor. Data types supported: Same as @p input. + * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. + */ + void configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info); + + // Inherited methods overridden: + void run(const Window &window) override; + BorderSize border_size() const override; + +private: + /** Function to perform 2x2 pooling. + * + * @param[in] window_input Input region on which to execute the kernel. 
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    template <PoolingType pooling_type>
+    void pooling2_f32(const Window &window_input, const Window &window);
+    /** Function to perform 2x2 pooling for 8-bit fixed point.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    template <PoolingType pooling_type>
+    void pooling2_q8(const Window &window_input, const Window &window);
+    /** Function to perform 3x3 pooling.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    template <PoolingType pooling_type>
+    void pooling3_f32(const Window &window_input, const Window &window);
+    /** Function to perform 3x3 pooling for 8-bit fixed point.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    template <PoolingType pooling_type>
+    void pooling3_q8(const Window &window_input, const Window &window);
+    /** Common signature for all the specialised Pooling functions
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    using PoolingFunction = void (NEPoolingLayerKernel::*)(const Window &window_input, const Window &window);
+
+private:
+    PoolingFunction _func;
+    const ITensor *_input;
+    ITensor *_output;
+    PoolingLayerInfo _pool_info;
+    int _num_elems_processed_per_iteration;
+    BorderSize _border_size;
+};
+}
+#endif /*__ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NERemapKernel.h b/arm_compute/core/NEON/kernels/NERemapKernel.h
new file mode 100644
index 0000000000..f9eae68ee8
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NERemapKernel.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#ifndef __ARM_COMPUTE_NEREMAPKERNEL_H__ +#define __ARM_COMPUTE_NEREMAPKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to perform a remap on a tensor */ +class NERemapKernel : public INEKernel +{ +public: + /** Default constructor */ + NERemapKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NERemapKernel(const NERemapKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NERemapKernel &operator=(const NERemapKernel &) = delete; + /** Allow instances of this class to be moved */ + NERemapKernel(NERemapKernel &&) = default; + /** Allow instances of this class to be moved */ + NERemapKernel &operator=(NERemapKernel &&) = default; + /** Default destructor */ + ~NERemapKernel() = default; + + /** Initialize the kernel's input, output and border mode. + * + * @param[in] input Source tensor. Data type supported: U8. + * @param[in] map_x Map for X coordinates. Data type supported: F32. + * @param[in] map_y Map for Y coordinates. Data type supported: F32. + * @param[out] output Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane. + * @param[in] policy The interpolation type. + */ + void configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + /** function to perform nearest interpolation on the given window */ + void remap_nearest(const Window &window); + /** function to perform bilinear interpolation on the given window */ + void remap_bilinear(const Window &window); + /** Remap function to use for the particular interpolation type passed to configure() */ + void (NERemapKernel::*_func)(const Window &window); + + const ITensor *_input; /**< Input image */ + ITensor *_output; /**< Output image */ + const ITensor *_map_x; /**< Input remap x coordinates */ + const ITensor *_map_y; /**< Input remap y coordinates */ +}; +} +#endif /*__ARM_COMPUTE_NEREMAPKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEScaleKernel.h b/arm_compute/core/NEON/kernels/NEScaleKernel.h new file mode 100644 index 0000000000..03e26520b5 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEScaleKernel.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESCALEKERNEL_H__
+#define __ARM_COMPUTE_NESCALEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform scaling on a tensor */
+class NEScaleKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEScaleKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEScaleKernel(const NEScaleKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEScaleKernel &operator=(const NEScaleKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEScaleKernel(NEScaleKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEScaleKernel &operator=(NEScaleKernel &&) = default;
+    /** Default destructor */
+    ~NEScaleKernel() = default;
+
+    /** Initialise the kernel's inputs, output and interpolation policy
+     *
+     * @note dx, dy and offsets have the same dimensions (width and height) as the output tensor
+     *
+     * @param[in]  input            Source tensor. Data types supported: U8/S16.
+     * @param[in]  dx               Pixel's distance between the X real coordinate and the smallest X following integer. Data type supported: F32
+     * @param[in]  dy               Pixel's distance between the Y real coordinate and the smallest Y following integer. Data type supported: F32
+     * @param[in]  offsets          Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32.
+     * @param[out] output           Destination tensor. Data types supported: U8/S16. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in]  policy           Interpolation type to use.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output, InterpolationPolicy policy, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    /** function to perform scale using nearest interpolation on the given window */
+    void scale_nearest(const Window &window);
+    /** function to perform scale using bilinear interpolation on the given window */
+    void scale_bilinear(const Window &window);
+    /** function to perform scale using area interpolation on the given window
+     *
+     * @note Used only in the case of down-sampling.
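+     * @note Area interpolation is assumed here to mean averaging the input
+     *       pixels covered by each output pixel's footprint.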
+ */ + void scale_area(const Window &window); + /** Scale function to use for the particular interpolation type passed to configure() */ + void (NEScaleKernel::*_func)(const Window &window); + + const ITensor *_offsets; + const ITensor *_dx; + const ITensor *_dy; + const ITensor *_input; + ITensor *_output; +}; +} +#endif /*__ARM_COMPUTE_NESCALEKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h b/arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h new file mode 100644 index 0000000000..c618456d49 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NESCHARR3x3KERNEL_H__ +#define __ARM_COMPUTE_NESCHARR3x3KERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the kernel to run a 3x3 Scharr filter on a tensor. + * +* @f[ +* \mathbf{G}_x=\begin{vmatrix} +* -3 & 0 & +3\\ +* -10& 0 & +10\\ +* -3 & 0 & +3 +* \end{vmatrix} +* @f] +*/ +class NEScharr3x3Kernel : public INEKernel +{ +public: + /** Default constructor */ + NEScharr3x3Kernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEScharr3x3Kernel(const NEScharr3x3Kernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEScharr3x3Kernel &operator=(const NEScharr3x3Kernel &) = delete; + /** Allow instances of this class to be moved */ + NEScharr3x3Kernel(NEScharr3x3Kernel &&) = default; + /** Allow instances of this class to be moved */ + NEScharr3x3Kernel &operator=(NEScharr3x3Kernel &&) = default; + /** Default destructor */ + ~NEScharr3x3Kernel() = default; + + /** Initialise the kernel's source, destination and border. + * + * @note At least one of output_x or output_y must be set. + * + * @param[in] input Source tensor. Data type supported: U8. + * @param[out] output_x (Optional) Destination tensor for the X gradient. Data type supported: S16. + * @param[out] output_y (Optional) Destination tensor for the Y gradient. Data type supported: S16. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. 
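+     *
+     * A minimal configuration sketch (tensor creation and allocation omitted;
+     * the variable names are illustrative only):
+     * @code
+     * NEScharr3x3Kernel scharr;
+     * // compute both gradients, with a defined border mode:
+     * scharr.configure(&src, &grad_x, &grad_y, false);
+     * @endcode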
+     */
+    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    bool _run_scharr_x;    /**< Do we need to run Scharr X? */
+    bool _run_scharr_y;    /**< Do we need to run Scharr Y? */
+    const ITensor *_input; /**< Input tensor */
+    ITensor *_output_x;    /**< Output tensor for scharr X */
+    ITensor *_output_y;    /**< Output tensor for scharr Y */
+};
+}
+#endif /*__ARM_COMPUTE_NESCHARR3x3KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NESobel3x3Kernel.h b/arm_compute/core/NEON/kernels/NESobel3x3Kernel.h
new file mode 100644
index 0000000000..246dd83573
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NESobel3x3Kernel.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESOBEL3x3KERNEL_H__
+#define __ARM_COMPUTE_NESOBEL3x3KERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to run a 3x3 Sobel X filter on a tensor.
+ *
+ * @f[
+ *      \mathbf{G}_x=\begin{vmatrix}
+ *      -1 & 0 & +1\\
+ *      -2 & 0 & +2\\
+ *      -1 & 0 & +1
+ *      \end{vmatrix}
+ * @f]
+ */
+class NESobel3x3Kernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NESobel3x3Kernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESobel3x3Kernel(const NESobel3x3Kernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESobel3x3Kernel &operator=(const NESobel3x3Kernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NESobel3x3Kernel(NESobel3x3Kernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NESobel3x3Kernel &operator=(NESobel3x3Kernel &&) = default;
+    /** Default destructor */
+    ~NESobel3x3Kernel() = default;
+
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @note At least one of output_x or output_y must be set.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    bool _run_sobel_x;     /**< Do we need to run Sobel X? */
+    bool _run_sobel_y;     /**< Do we need to run Sobel Y? */
+    const ITensor *_input; /**< Input tensor */
+    ITensor *_output_x;    /**< Output tensor for sobel X */
+    ITensor *_output_y;    /**< Output tensor for sobel Y */
+};
+}
+#endif /*__ARM_COMPUTE_NESOBEL3x3KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NESobel5x5Kernel.h b/arm_compute/core/NEON/kernels/NESobel5x5Kernel.h
new file mode 100644
index 0000000000..49c1c41e6d
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NESobel5x5Kernel.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESOBEL5x5KERNEL_H__
+#define __ARM_COMPUTE_NESOBEL5x5KERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to run the horizontal pass of the 5x5 Sobel filter on a tensor.
+ *
+ */
+class NESobel5x5HorKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NESobel5x5HorKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESobel5x5HorKernel(const NESobel5x5HorKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESobel5x5HorKernel &operator=(const NESobel5x5HorKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NESobel5x5HorKernel(NESobel5x5HorKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NESobel5x5HorKernel &operator=(NESobel5x5HorKernel &&) = default;
+    /** Default destructor */
+    ~NESobel5x5HorKernel() = default;
+
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @note At least one of output_x or output_y must be set.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
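+     *
+     * A two-pass usage sketch (the intermediate and output tensors are
+     * illustrative, not part of this header):
+     * @code
+     * NESobel5x5HorKernel  hor;
+     * NESobel5x5VertKernel vert;
+     * hor.configure(&src, &tmp_x, &tmp_y, false);
+     * vert.configure(&tmp_x, &tmp_y, &grad_x, &grad_y, false);
+     * @endcode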
+     */
+    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    const ITensor *_input;   /**< Input tensor */
+    ITensor *_output_x;      /**< X output of horizontal pass */
+    ITensor *_output_y;      /**< Y output of horizontal pass */
+    bool _run_sobel_x;       /**< Do we need to run Sobel X? */
+    bool _run_sobel_y;       /**< Do we need to run Sobel Y? */
+    BorderSize _border_size; /**< Border size */
+};
+
+/** Interface for the kernel to run the vertical pass of the 5x5 Sobel filter on a tensor.
+ *
+ */
+class NESobel5x5VertKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NESobel5x5VertKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESobel5x5VertKernel(const NESobel5x5VertKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESobel5x5VertKernel &operator=(const NESobel5x5VertKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NESobel5x5VertKernel(NESobel5x5VertKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NESobel5x5VertKernel &operator=(NESobel5x5VertKernel &&) = default;
+    /** Default destructor */
+    ~NESobel5x5VertKernel() = default;
+
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @param[in]  input_x          Input for X (X output of the horizontal pass). Data type supported: S16.
+     * @param[in]  input_y          Input for Y (Y output of the horizontal pass). Data type supported: S16.
+     * @param[out] output_x         Destination tensor for the X gradient. Data type supported: S16.
+     * @param[out] output_y         Destination tensor for the Y gradient. Data type supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(ITensor *input_x, ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    ITensor *_input_x;  /**< X input (X output of the hor pass) */
+    ITensor *_input_y;  /**< Y input (Y output of the hor pass) */
+    ITensor *_output_x; /**< X output of sobel */
+    ITensor *_output_y; /**< Y output of sobel */
+    bool _run_sobel_x;  /**< Do we need to run sobel X? */
+    bool _run_sobel_y;  /**< Do we need to run sobel Y? */
+};
+}
+#endif /*__ARM_COMPUTE_NESOBEL5x5KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NESobel7x7Kernel.h b/arm_compute/core/NEON/kernels/NESobel7x7Kernel.h
new file mode 100644
index 0000000000..4bff8596b8
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NESobel7x7Kernel.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESOBEL7x7KERNEL_H__
+#define __ARM_COMPUTE_NESOBEL7x7KERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to run the horizontal pass of the 7x7 Sobel filter on a tensor.
+ *
+ */
+class NESobel7x7HorKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NESobel7x7HorKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESobel7x7HorKernel(const NESobel7x7HorKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESobel7x7HorKernel &operator=(const NESobel7x7HorKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NESobel7x7HorKernel(NESobel7x7HorKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NESobel7x7HorKernel &operator=(NESobel7x7HorKernel &&) = default;
+    /** Default destructor */
+    ~NESobel7x7HorKernel() = default;
+
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @note At least one of output_x or output_y must be set.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S32.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S32.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    const ITensor *_input;   /**< Input tensor */
+    ITensor *_output_x;      /**< X output of horizontal pass */
+    ITensor *_output_y;      /**< Y output of horizontal pass */
+    bool _run_sobel_x;       /**< Do we need to run Sobel X? */
+    bool _run_sobel_y;       /**< Do we need to run Sobel Y? */
+    BorderSize _border_size; /**< Border size */
+};
+
+/** Interface for the kernel to run the vertical pass of the 7x7 Sobel filter on a tensor.
+ *
+ */
+class NESobel7x7VertKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NESobel7x7VertKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESobel7x7VertKernel(const NESobel7x7VertKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESobel7x7VertKernel &operator=(const NESobel7x7VertKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NESobel7x7VertKernel(NESobel7x7VertKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NESobel7x7VertKernel &operator=(NESobel7x7VertKernel &&) = default;
+    /** Default destructor */
+    ~NESobel7x7VertKernel() = default;
+
+    /** Initialise the kernel's source, destination and border mode.
+     *
+     * @note At least one of output_x or output_y must be set.
+     * @note If output_x is set then input_x must be set too.
+     * @note If output_y is set then input_y must be set too.
+     *
+     * @param[in]  input_x          (Optional) Input for X (X output of the horizontal pass). Data type supported: S32.
+     * @param[in]  input_y          (Optional) Input for Y (Y output of the horizontal pass). Data type supported: S32.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S32.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S32.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input_x, const ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    const ITensor *_input_x; /**< X input (X output of the hor pass) */
+    const ITensor *_input_y; /**< Y input (Y output of the hor pass) */
+    ITensor *_output_x;      /**< X output of sobel */
+    ITensor *_output_y;      /**< Y output of sobel */
+    bool _run_sobel_x;       /**< Do we need to run sobel X? */
+    bool _run_sobel_y;       /**< Do we need to run sobel Y? */
+};
+}
+#endif /*__ARM_COMPUTE_NESOBEL7x7KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
new file mode 100644
index 0000000000..ab626ad5ec
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H__
+#define __ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for identifying the max value of 1D Logits */
+class NELogits1DMaxKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NELogits1DMaxKernel();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input  Source tensor. Data types supported: QS8, F32.
+     * @param[out] output Destination tensor.
Data types supported: same as @p input + */ + void configure(const ITensor *input, ITensor *output); + + // Inherited methods overridden: + void run(const Window &window) override; + BorderSize border_size() const override; + +private: + using Logits1DMaxFunction = void(const ITensor *in, ITensor *out, const Window &window); + +private: + Logits1DMaxFunction *_func; + BorderSize _border_size; +}; + +/** Interface for shifting the logits values around the max value and exponentiating the result */ +class NELogits1DShiftExpSumKernel : public INEKernel +{ +public: + /** Default constructor */ + NELogits1DShiftExpSumKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NELogits1DShiftExpSumKernel(const NELogits1DShiftExpSumKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NELogits1DShiftExpSumKernel &operator=(const NELogits1DShiftExpSumKernel &) = delete; + /** Allow instances of this class to be moved */ + NELogits1DShiftExpSumKernel(NELogits1DShiftExpSumKernel &&) = default; + /** Allow instances of this class to be moved */ + NELogits1DShiftExpSumKernel &operator=(NELogits1DShiftExpSumKernel &&) = default; + /** Default destructor */ + ~NELogits1DShiftExpSumKernel() = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data types supported: QS8, F32. + * @param[in] max Max values tensor. Data types supported: same as @p input. + * @param[out] output Destination tensor. Data types supported: same as @p input. + * @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p input. + */ + void configure(const ITensor *input, const ITensor *max, ITensor *output, ITensor *sum); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + using Logits1DShiftExpSumFunction = void(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window); + +private: + Logits1DShiftExpSumFunction *_func; + const ITensor *_input; + const ITensor *_max; + ITensor *_output; + ITensor *_sum; +}; + +/** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */ +class NELogits1DNormKernel : public INEKernel +{ +public: + /** Default constructor */ + NELogits1DNormKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NELogits1DNormKernel(const NELogits1DNormKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NELogits1DNormKernel &operator=(const NELogits1DNormKernel &) = delete; + /** Allow instances of this class to be moved */ + NELogits1DNormKernel(NELogits1DNormKernel &&) = default; + /** Allow instances of this class to be moved */ + NELogits1DNormKernel &operator=(NELogits1DNormKernel &&) = default; + /** Default destructor */ + ~NELogits1DNormKernel() = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data types supported: QS8, F32. + * @param[in] sum Sum tensor. The number of dimensions should be dim(input)-1. Data types supported: same as @p input. + * @param[out] output Destination tensor. Data types supported: same as @p input. 
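+     *
+     * A sketch of the three-stage softmax pipeline built from these kernels
+     * (tensor allocation omitted; the variable names are illustrative only):
+     * @code
+     * NELogits1DMaxKernel         max_kernel;
+     * NELogits1DShiftExpSumKernel exp_sum_kernel;
+     * NELogits1DNormKernel        norm_kernel;
+     * max_kernel.configure(&in, &max_vals);
+     * exp_sum_kernel.configure(&in, &max_vals, &exp_vals, &sums);
+     * norm_kernel.configure(&exp_vals, &sums, &out);
+     * @endcode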
+     */
+    void configure(const ITensor *input, const ITensor *sum, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    using Logits1DNormFunction = void(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window);
+
+private:
+    Logits1DNormFunction *_func;
+    const ITensor *_input;
+    const ITensor *_sum;
+    ITensor *_output;
+};
+}
+#endif /*__ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NETableLookupKernel.h b/arm_compute/core/NEON/kernels/NETableLookupKernel.h
new file mode 100644
index 0000000000..b3963e5a75
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NETableLookupKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NETABLELOOKUPKERNEL_H__
+#define __ARM_COMPUTE_NETABLELOOKUPKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+class ILut;
+
+/** Interface for the kernel to perform table lookup calculations. */
+class NETableLookupKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NETableLookupKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NETableLookupKernel(const NETableLookupKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NETableLookupKernel &operator=(const NETableLookupKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NETableLookupKernel(NETableLookupKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NETableLookupKernel &operator=(NETableLookupKernel &&) = default;
+    /** Initialise the kernel's input, lut and output.
+     *
+     * @param[in]  input  An input tensor. Data types supported: U8/S16.
+     * @param[in]  lut    The input LUT.
+     * @param[out] output The output tensor. Data types supported: same as @p input
+     */
+    void configure(const ITensor *input, const ILut *lut, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Perform table lookup on a given window.
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    template <class T>
+    void tableLookup(const Window &window);
+    /** Common signature for all the specialised lut functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using TableLookupFunction = void (NETableLookupKernel::*)(const Window &window);
+    /** Sub function to use for the particular tensor types passed to configure() */
+    TableLookupFunction _func;
+    const ILut *_lut;
+};
+}
+#endif /* __ARM_COMPUTE_NETABLELOOKUPKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEThresholdKernel.h b/arm_compute/core/NEON/kernels/NEThresholdKernel.h
new file mode 100644
index 0000000000..778176293f
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEThresholdKernel.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NETHRESHOLDKERNEL_H__
+#define __ARM_COMPUTE_NETHRESHOLDKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the thresholding kernel
+ *
+ */
+class NEThresholdKernel : public INEKernel
+{
+public:
+    /** Constructor
+     *  Initialize all the pointers to nullptr and parameters to zero.
+     */
+    NEThresholdKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEThresholdKernel(const NEThresholdKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEThresholdKernel &operator=(const NEThresholdKernel &) = delete;
+    /** Initialise the kernel's input, output and threshold parameters.
+     *
+     * @param[in]  input       An input tensor. Data type supported: U8.
+     * @param[out] output      The output tensor. Data type supported: U8.
+     * @param[in]  threshold   Threshold. When the threshold type is RANGE, this is used as the lower threshold.
+     * @param[in]  false_value Value to set when the condition is not respected.
+     * @param[in]  true_value  Value to set when the condition is respected.
+     * @param[in]  type        Thresholding type. Either RANGE or BINARY.
+     * @param[in]  upper       Upper threshold. Only used when the thresholding type is RANGE.
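+     *
+     * A binary-threshold configuration sketch (tensor setup omitted; the
+     * variable names and constants are illustrative only):
+     * @code
+     * NEThresholdKernel kernel;
+     * // false_value = 0, true_value = 255; upper is ignored for BINARY:
+     * kernel.configure(&src, &dst, 127, 0, 255, ThresholdType::BINARY, 0);
+     * @endcode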
+ */ + void configure(const ITensor *input, ITensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + /** run binary thresholding on the given window */ + void run_binary(const Window &window); + /** run range thresholding on the given window */ + void run_range(const Window &window); + + void (NEThresholdKernel::*_func)(const Window &window); + + const ITensor *_input; /**< Input */ + ITensor *_output; /**< Output */ + uint8_t _threshold; + uint8_t _false_value; + uint8_t _true_value; + uint8_t _upper; +}; +} +#endif /*__ARM_COMPUTE_NETHRESHOLDKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NETransposeKernel.h b/arm_compute/core/NEON/kernels/NETransposeKernel.h new file mode 100644 index 0000000000..ac9449ff92 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NETransposeKernel.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NETRANSPOSEKERNEL_H__ +#define __ARM_COMPUTE_NETRANSPOSEKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel which transposes the elements of a matrix. + * + * [width, height, batch] -> [height, width, batch] + * + */ +class NETransposeKernel : public INEKernel +{ +public: + /** Default constructor */ + NETransposeKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NETransposeKernel(const NETransposeKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NETransposeKernel &operator=(const NETransposeKernel &) = delete; + /** Allow instances of this class to be moved */ + NETransposeKernel(NETransposeKernel &&) = default; + /** Allow instances of this class to be moved */ + NETransposeKernel &operator=(NETransposeKernel &&) = default; + /** Default destructor */ + ~NETransposeKernel() = default; + + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32 + * @param[out] output Output tensor. 
Data type supported: Same as @p input
+     */
+    void configure(const ITensor *input, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Common signature for all the transpose functions
+     *
+     * @param[in]  input  An input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
+     * @param[out] output The output tensor. Data type supported: same as @p input
+     * @param[in]  window Region on which to execute the kernel.
+     */
+    using TransposeFunction = void(const ITensor *input, ITensor *output, const Window &window);
+    /** Transpose function to use for the particular tensor types passed to configure() */
+    TransposeFunction *_func;
+    const ITensor *_input;
+    ITensor *_output;
+};
+}
+#endif /* __ARM_COMPUTE_NETRANSPOSEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEWarpKernel.h b/arm_compute/core/NEON/kernels/NEWarpKernel.h
new file mode 100644
index 0000000000..10fed1d450
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEWarpKernel.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEWARPKERNEL_H__
+#define __ARM_COMPUTE_NEWARPKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Common interface for warp affine and warp perspective */
+class INEWarpKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    INEWarpKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    INEWarpKernel(const INEWarpKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    INEWarpKernel &operator=(const INEWarpKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    INEWarpKernel(INEWarpKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    INEWarpKernel &operator=(INEWarpKernel &&) = default;
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input  Source tensor. Data type supported: U8.
+     * @param[out] output Destination tensor. Data type supported: U8.
+     * @param[in]  matrix The perspective or affine matrix to use. Must be of type float, 2x3 for an affine matrix and 3x3 for a perspective one.
+     * @param[in]  border_mode           Strategy to use for borders.
+     * @param[in]  constant_border_value Constant value used for filling the border.
+     */
+    virtual void configure(const ITensor *input, ITensor *output, const float *matrix, BorderMode border_mode, uint8_t constant_border_value);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+protected:
+    /** function to perform warp affine or warp perspective on the given window when border mode == UNDEFINED
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    virtual void warp_undefined(const Window &window) = 0;
+    /** function to perform warp affine or warp perspective on the given window when border mode == CONSTANT
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    virtual void warp_constant(const Window &window) = 0;
+    /** function to perform warp affine or warp perspective on the given window when border mode == REPLICATE
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    virtual void warp_replicate(const Window &window) = 0;
+    /** Common signature for all the specialised warp functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    void (INEWarpKernel::*_func)(const Window &window);
+
+    const ITensor *_input;          /**< Input Tensor */
+    ITensor *_output;               /**< Output Tensor */
+    uint8_t _constant_border_value; /**< Constant value used for filling the border. This value is used for those pixels out of the ROI when the border mode is CONSTANT */
+    const float *_matrix;           /**< The affine or perspective matrix. Must be of type float, 2x3 for warp affine and 3x3 for warp perspective. */
+};
+
+/** Template interface for the kernel to compute warp affine
+ *
+ */
+template <InterpolationPolicy interpolation>
+class NEWarpAffineKernel : public INEWarpKernel
+{
+private:
+    // Inherited methods overridden:
+    void warp_undefined(const Window &window) override;
+    void warp_constant(const Window &window) override;
+    void warp_replicate(const Window &window) override;
+};
+
+/** Template interface for the kernel to compute warp perspective
+ *
+ */
+template <InterpolationPolicy interpolation>
+class NEWarpPerspectiveKernel : public INEWarpKernel
+{
+private:
+    // Inherited methods overridden:
+    void warp_undefined(const Window &window) override;
+    void warp_constant(const Window &window) override;
+    void warp_replicate(const Window &window) override;
+};
+}
+#endif /*__ARM_COMPUTE_NEWARPKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
new file mode 100644
index 0000000000..cad2d00b1f
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
diff --git a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
new file mode 100644
index 0000000000..cad2d00b1f
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H__
+#define __ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform reshaping on the weights used by convolution and locally connected layers
+ *
+ * Rearranges each 3-dimensional kernel to a single row, leading to a matrix with linearized kernels.
+ * In combination with the @ref NEIm2ColKernel it can be used to transform a convolution into a matrix multiplication.
+ *
+ * For example, assuming a 3D weight kernel of 3x3 dimensions and depth of 2 we have:
+ * @f[
+ * \left( \begin{array}{ccc}
+ * a000 & a001 & a002 \\
+ * a010 & a011 & a012 \\
+ * a020 & a021 & a022 \\
+ * \end{array} \right)
+ * \left( \begin{array}{ccc}
+ * a100 & a101 & a102 \\
+ * a110 & a111 & a112 \\
+ * a120 & a121 & a122 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{cccccccccccccccccc}
+ * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\
+ * \end{array} \right)
+ * @f]
+ */
+class NEWeightsReshapeKernel : public INEKernel
+{
+public:
+    /** Constructor.*/
+    NEWeightsReshapeKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEWeightsReshapeKernel(const NEWeightsReshapeKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEWeightsReshapeKernel &operator=(const NEWeightsReshapeKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEWeightsReshapeKernel(NEWeightsReshapeKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEWeightsReshapeKernel &operator=(NEWeightsReshapeKernel &&) = default;
+    /** Default destructor */
+    ~NEWeightsReshapeKernel() = default;
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  input  The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
+     *                    and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QS8/F32
+     * @param[in]  bias   The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
+     *                    dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
+     * @param[out] output The output tensor.
+     *                    Data types supported: Same as @p input
+     */
+    void configure(const ITensor *input, const ITensor *bias, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Common signature for all the specialised weights reshape functions */
+    using WeightsReshapeKernel = void(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window);
+
+    WeightsReshapeKernel *_func;
+    const ITensor        *_input;
+    const ITensor        *_bias;
+    ITensor              *_output;
+};
+}
+
+#endif /*__ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H__ */
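A sketch of the shape arithmetic described above (editorial addition, with hypothetical helper names): each OFM kernel of kernel_x * kernel_y * IFM elements becomes one row of the reshaped matrix, with one extra element per row when biases are appended.

    #include <cstddef>

    // Hypothetical helper: dimensions of the reshaped weights matrix,
    // one linearized kernel per row.
    struct ReshapedDims
    {
        std::size_t rows;
        std::size_t cols;
    };

    ReshapedDims reshaped_weights_dims(std::size_t kernel_x, std::size_t kernel_y,
                                       std::size_t ifm, std::size_t ofm, bool has_bias)
    {
        return { ofm, kernel_x * kernel_y * ifm + (has_bias ? 1u : 0u) };
    }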
diff --git a/arm_compute/core/PixelValue.h b/arm_compute/core/PixelValue.h
new file mode 100644
index 0000000000..b4912ce15a
--- /dev/null
+++ b/arm_compute/core/PixelValue.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_PIXELVALUE_H__
+#define __ARM_COMPUTE_PIXELVALUE_H__
+
+#include <cstdint>
+
+namespace arm_compute
+{
+/** Class describing the value of a pixel for any image format. */
+class PixelValue
+{
+public:
+    /** Default constructor: value initialized to 0 */
+    PixelValue()
+        : value{ { 0 } }
+    {
+    }
+    /** Initialize the union with a U8 pixel value
+     *
+     * @param[in] v U8 value.
+     */
+    PixelValue(uint8_t v)
+        : PixelValue()
+    {
+        value.u8 = v;
+    }
+    /** Initialize the union with a U16 pixel value
+     *
+     * @param[in] v U16 value.
+     */
+    PixelValue(uint16_t v)
+        : PixelValue()
+    {
+        value.u16 = v;
+    }
+    /** Initialize the union with a S16 pixel value
+     *
+     * @param[in] v S16 value.
+     */
+    PixelValue(int16_t v)
+        : PixelValue()
+    {
+        value.s16 = v;
+    }
+    /** Initialize the union with a U32 pixel value
+     *
+     * @param[in] v U32 value.
+     */
+    PixelValue(uint32_t v)
+        : PixelValue()
+    {
+        value.u32 = v;
+    }
+    /** Initialize the union with a S32 pixel value
+     *
+     * @param[in] v S32 value.
+     */
+    PixelValue(int32_t v)
+        : PixelValue()
+    {
+        value.s32 = v;
+    }
+    /** Initialize the union with a F32 pixel value
+     *
+     * @param[in] v F32 value.
+     */
+    PixelValue(float v)
+        : PixelValue()
+    {
+        value.f32 = v;
+    }
+    /** Union which describes the value of a pixel for any image format.
+     * Use the field corresponding to the image format.
+     */
+    union
+    {
+        uint8_t  rgb[3];  /**< 3 channels: RGB888 */
+        uint8_t  yuv[3];  /**< 3 channels: any YUV format */
+        uint8_t  rgbx[4]; /**< 4 channels: RGBX8888 */
+        float    f32;     /**< Single channel float 32 */
+        uint8_t  u8;      /**< Single channel U8 */
+        int8_t   s8;      /**< Single channel S8 */
+        uint16_t u16;     /**< Single channel U16 */
+        int16_t  s16;     /**< Single channel S16 */
+        uint32_t u32;     /**< Single channel U32 */
+        int32_t  s32;     /**< Single channel S32 */
+    } value;
+    /** Interpret the pixel value as a U8
+     *
+     * @param[out] v Returned value
+     */
+    void get(uint8_t &v) const
+    {
+        v = value.u8;
+    }
+    /** Interpret the pixel value as a S8
+     *
+     * @param[out] v Returned value
+     */
+    void get(int8_t &v) const
+    {
+        v = value.s8;
+    }
+    /** Interpret the pixel value as a U16
+     *
+     * @param[out] v Returned value
+     */
+    void get(uint16_t &v) const
+    {
+        v = value.u16;
+    }
+    /** Interpret the pixel value as a S16
+     *
+     * @param[out] v Returned value
+     */
+    void get(int16_t &v) const
+    {
+        v = value.s16;
+    }
+    /** Interpret the pixel value as a U32
+     *
+     * @param[out] v Returned value
+     */
+    void get(uint32_t &v) const
+    {
+        v = value.u32;
+    }
+    /** Interpret the pixel value as a S32
+     *
+     * @param[out] v Returned value
+     */
+    void get(int32_t &v) const
+    {
+        v = value.s32;
+    }
+    /** Interpret the pixel value as a F32
+     *
+     * @param[out] v Returned value
+     */
+    void get(float &v) const
+    {
+        v = value.f32;
+    }
+};
+}
+#endif /* __ARM_COMPUTE_PIXELVALUE_H__ */
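Minimal usage sketch for PixelValue (editorial addition, not part of the header above): each constructor fills the union field matching the C++ type of its argument, and get() reads it back through an out-parameter of the same type.

    #include <cstdint>
    #include "arm_compute/core/PixelValue.h"

    void pixel_value_example()
    {
        arm_compute::PixelValue border(static_cast<uint8_t>(255)); // selects the U8 field
        uint8_t v = 0;
        border.get(v); // v == 255
    }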
diff --git a/arm_compute/core/PyramidInfo.h b/arm_compute/core/PyramidInfo.h
new file mode 100644
index 0000000000..76b3852bbf
--- /dev/null
+++ b/arm_compute/core/PyramidInfo.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_PYRAMIDINFO_H__
+#define __ARM_COMPUTE_PYRAMIDINFO_H__
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+/** Store the Pyramid's metadata */
+class PyramidInfo
+{
+public:
+    /** Default constructor */
+    PyramidInfo();
+    /** Default destructor */
+    virtual ~PyramidInfo() = default;
+    /** Allow instances of this class to be copy constructed */
+    PyramidInfo(const PyramidInfo &) = default;
+    /** Allow instances of this class to be copied */
+    PyramidInfo &operator=(const PyramidInfo &) = default;
+    /** Allow instances of this class to be move constructed */
+    PyramidInfo(PyramidInfo &&) = default;
+    /** Allow instances of this class to be moved */
+    PyramidInfo &operator=(PyramidInfo &&) = default;
+
+    /** Create pyramid info for 2D tensors
+     *
+     * @param[in] num_levels The number of pyramid levels. This is required to be a non-zero value
+     * @param[in] scale      Used to indicate the scale between the pyramid levels.
+     *                       This is required to be a non-zero positive value.
+     * @param[in] width      The width of the 2D tensor at 0th pyramid level
+     * @param[in] height     The height of the 2D tensor at 0th pyramid level
+     * @param[in] format     The format of all 2D tensors in the pyramid.
+     *                       NV12, NV21, IYUV, UYVY and YUYV formats are not supported.
+     */
+    PyramidInfo(size_t num_levels, float scale, size_t width, size_t height, Format format);
+
+    /** Create pyramid info using TensorShape
+     *
+     * @param[in] num_levels   The number of pyramid levels. This is required to be a non-zero value
+     * @param[in] scale        Used to indicate the scale between the pyramid levels.
+     *                         This is required to be a non-zero positive value.
+     * @param[in] tensor_shape It specifies the size for each dimension of the 0th pyramid level tensor, in number of elements
+     * @param[in] format       The format of all tensors in the pyramid
+     */
+    PyramidInfo(size_t num_levels, float scale, const TensorShape &tensor_shape, Format format);
+
+    /** Initialize pyramid's metadata for 2D tensors
+     *
+     * @param[in] num_levels The number of pyramid levels. This is required to be a non-zero value
+     * @param[in] scale      Used to indicate the scale between the pyramid levels.
+     *                       This is required to be a non-zero positive value.
+     * @param[in] width      The width of the 2D tensor at 0th pyramid level
+     * @param[in] height     The height of the 2D tensor at 0th pyramid level
+     * @param[in] format     The format of all 2D tensors in the pyramid.
+     *                       NV12, NV21, IYUV, UYVY and YUYV formats are not supported.
+     */
+    void init(size_t num_levels, float scale, size_t width, size_t height, Format format);
+    /** Initialize pyramid's metadata using TensorShape
+     *
+     * @param[in] num_levels   The number of pyramid levels. This is required to be a non-zero value
+     * @param[in] scale        Used to indicate the scale between the pyramid levels.
+     *                         This is required to be a non-zero positive value.
+     * @param[in] tensor_shape It specifies the size for each dimension of the 0th pyramid level tensor, in number of elements
+     * @param[in] format       The format of all tensors in the pyramid
+     */
+    void init(size_t num_levels, float scale, const TensorShape &tensor_shape, Format format);
+    /** Return the number of the pyramid levels
+     *
+     * @return The number of the pyramid levels
+     */
+    size_t num_levels() const;
+    /** Return the width of the 0th level tensor
+     *
+     * @return The width of the 0th level tensor
+     */
+    size_t width() const;
+    /** Return the height of the 0th level tensor
+     *
+     * @return The height of the 0th level tensor
+     */
+    size_t height() const;
+    /** Return the TensorShape of the 0th level tensor
+     *
+     * @return The TensorShape of the 0th level tensor
+     */
+    const TensorShape &tensor_shape() const;
+    /** Return the image format of all tensors in the pyramid
+     *
+     * @return The image format
+     */
+    Format format() const;
+    /** Return the scale factor of the pyramid
+     *
+     * @return The scale factor
+     */
+    float scale() const;
+
+private:
+    size_t      _num_levels;
+    TensorShape _tensor_shape;
+    Format      _format;
+    float       _scale;
+};
+}
+#endif /*__ARM_COMPUTE_PYRAMIDINFO_H__ */
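Illustrative sketch for PyramidInfo (editorial addition): a 4-level half-scale U8 pyramid starting at 640x480. Level i is roughly scale^i times the level-0 size, i.e. 640x480, 320x240, 160x120, 80x60 here; the exact rounding of intermediate levels is left to the library.

    #include "arm_compute/core/PyramidInfo.h"
    #include "arm_compute/core/Types.h"

    void make_pyramid_info(arm_compute::PyramidInfo &info)
    {
        info.init(4 /* num_levels */, arm_compute::SCALE_PYRAMID_HALF,
                  640 /* width */, 480 /* height */, arm_compute::Format::U8);
    }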
Initializes "width" and "height" with the dimensions of "size" + * + * @param[in] size Size data object + */ + Size2D(const Size2D &size) + : width(size.width), height(size.height) + { + } + /** Copy assignment + * + * @param[in] size Constant reference input "Size2D" data object to copy + * + * @return Reference to the newly altered left hand side "Size2D" data object + */ + Size2D &operator=(const Size2D &size) + { + width = size.width; + height = size.height; + return *this; + } + /** The area of the image or rectangle calculated as (width * height) + * + * @return Area (width * height) + * + */ + size_t area() const + { + return (width * height); + } + +public: + size_t width; /**< Width of the image region or rectangle */ + size_t height; /**< Height of the image region or rectangle */ +}; +} +#endif /*__ARM_COMPUTE_SIZE2D_H__ */ diff --git a/arm_compute/core/Steps.h b/arm_compute/core/Steps.h new file mode 100644 index 0000000000..33a88a2568 --- /dev/null +++ b/arm_compute/core/Steps.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_STEPS_H__ +#define __ARM_COMPUTE_STEPS_H__ + +#include "arm_compute/core/Dimensions.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Types.h" + +#include +#include +#include + +namespace arm_compute +{ +/** Class to describe a number of elements in each dimension. Similar to @ref + * Strides but not in bytes but number of elements. + */ +class Steps : public Dimensions +{ +public: + /** Constructor to initialize the steps. + * + * @param[in] steps Values to initialize the steps. + */ + template + Steps(Ts... steps) + : Dimensions{ steps... 
diff --git a/arm_compute/core/Steps.h b/arm_compute/core/Steps.h
new file mode 100644
index 0000000000..33a88a2568
--- /dev/null
+++ b/arm_compute/core/Steps.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_STEPS_H__
+#define __ARM_COMPUTE_STEPS_H__
+
+#include "arm_compute/core/Dimensions.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+
+namespace arm_compute
+{
+/** Class to describe a number of elements in each dimension. Similar to @ref Strides,
+ * but expressed in number of elements rather than in bytes.
+ */
+class Steps : public Dimensions<unsigned int>
+{
+public:
+    /** Constructor to initialize the steps.
+     *
+     * @param[in] steps Values to initialize the steps.
+     */
+    template <typename... Ts>
+    Steps(Ts... steps)
+        : Dimensions{ steps... }
+    {
+        // Initialize empty dimensions to 1
+        std::fill(_id.begin() + _num_dimensions, _id.end(), 1);
+    }
+    /** Allow instances of this class to be copy constructed */
+    constexpr Steps(const Steps &) = default;
+    /** Allow instances of this class to be copied */
+    Steps &operator=(const Steps &) = default;
+    /** Allow instances of this class to be move constructed */
+    constexpr Steps(Steps &&) = default;
+    /** Allow instances of this class to be moved */
+    Steps &operator=(Steps &&) = default;
+    /** Default destructor */
+    ~Steps() = default;
+};
+}
+#endif /*__ARM_COMPUTE_STEPS_H__*/
diff --git a/arm_compute/core/Strides.h b/arm_compute/core/Strides.h
new file mode 100644
index 0000000000..329fafb5f8
--- /dev/null
+++ b/arm_compute/core/Strides.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_STRIDES_H__
+#define __ARM_COMPUTE_STRIDES_H__
+
+#include "arm_compute/core/Dimensions.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+
+namespace arm_compute
+{
+/** Strides of an item in bytes */
+class Strides : public Dimensions<uint32_t>
+{
+public:
+    /** Constructor to initialize the strides.
+     *
+     * @param[in] strides Values to initialize the strides.
+     */
+    template <typename... Ts>
+    constexpr Strides(Ts... strides)
+        : Dimensions{ strides... }
+    {
+    }
+    /** Allow instances of this class to be copy constructed */
+    constexpr Strides(const Strides &) = default;
+    /** Allow instances of this class to be copied */
+    Strides &operator=(const Strides &) = default;
+    /** Allow instances of this class to be move constructed */
+    constexpr Strides(Strides &&) = default;
+    /** Allow instances of this class to be moved */
+    Strides &operator=(Strides &&) = default;
+    /** Default destructor */
+    ~Strides() = default;
+};
+}
+#endif /*__ARM_COMPUTE_STRIDES_H__*/
diff --git a/arm_compute/core/SubTensorInfo.h b/arm_compute/core/SubTensorInfo.h
new file mode 100644
index 0000000000..e2532fd487
--- /dev/null
+++ b/arm_compute/core/SubTensorInfo.h
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_SUBTENSORINFO_H__
+#define __ARM_COMPUTE_SUBTENSORINFO_H__
+
+#include "arm_compute/core/ITensorInfo.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Strides.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+/** Store the sub tensor's metadata */
+class SubTensorInfo final : public ITensorInfo
+{
+public:
+    /** Default constructor */
+    SubTensorInfo();
+    /** Constructor
+     *
+     * @param[in] parent       Metadata of parent tensor.
+     * @param[in] tensor_shape Tensor shape. Shape must fit inside parent's shape.
+     *                         X and Y dimensions must match the parent's ones.
+     * @param[in] coords       Coordinates of starting element inside parent tensor.
+ */ + SubTensorInfo(ITensorInfo *parent, const TensorShape &tensor_shape, const Coordinates &coords); + /** Default destructor */ + ~SubTensorInfo() = default; + /** Allow instances of this class to be copy constructed */ + SubTensorInfo(const SubTensorInfo &) = default; + /** Allow instances of this class to be copied */ + SubTensorInfo &operator=(const SubTensorInfo &) = default; + /** Allow instances of this class to be move constructed */ + SubTensorInfo(SubTensorInfo &&) = default; + /** Allow instances of this class to be moved */ + SubTensorInfo &operator=(SubTensorInfo &&) = default; + + // Inherited methods overridden: + void set_data_type(DataType data_type) override + { + ARM_COMPUTE_ERROR_ON(_parent == nullptr); + _parent->set_data_type(data_type); + }; + void set_num_channels(int num_channels) override + { + ARM_COMPUTE_ERROR_ON(_parent == nullptr); + _parent->set_num_channels(num_channels); + }; + void set_format(Format format) override + { + ARM_COMPUTE_ERROR_ON(_parent == nullptr); + _parent->set_format(format); + }; + void set_fixed_point_position(int fixed_point_position) override + { + ARM_COMPUTE_ERROR_ON(_parent == nullptr); + _parent->set_fixed_point_position(fixed_point_position); + }; + void set_tensor_shape(TensorShape shape) override; + bool auto_padding() override + { + ARM_COMPUTE_ERROR_ON(_parent == nullptr); + return _parent->auto_padding(); + }; + bool extend_padding(const PaddingSize &padding) override; + size_t dimension(size_t index) const override + { + return _tensor_shape[index]; + } + const Strides &strides_in_bytes() const override + { + ARM_COMPUTE_ERROR_ON(_parent == nullptr); + return _parent->strides_in_bytes(); + } + size_t offset_first_element_in_bytes() const override + { + ARM_COMPUTE_ERROR_ON(_parent == nullptr); + return _parent->offset_element_in_bytes(_coords); + } + size_t offset_element_in_bytes(const Coordinates &pos) const override; + int fixed_point_position() const override + { + ARM_COMPUTE_ERROR_ON(_parent == nullptr); + return _parent->fixed_point_position(); + } + size_t element_size() const override + { + ARM_COMPUTE_ERROR_ON(_parent == nullptr); + return _parent->element_size(); + } + size_t num_dimensions() const override + { + return _tensor_shape.num_dimensions(); + } + size_t num_channels() const override + { + ARM_COMPUTE_ERROR_ON(_parent == nullptr); + return _parent->num_channels(); + } + const TensorShape &tensor_shape() const override + { + ARM_COMPUTE_ERROR_ON(_parent == nullptr); + return _tensor_shape; + } + DataType data_type() const override + { + ARM_COMPUTE_ERROR_ON(_parent == nullptr); + return _parent->data_type(); + } + Format format() const override + { + ARM_COMPUTE_ERROR_ON(_parent == nullptr); + return _parent->format(); + } + size_t total_size() const override + { + ARM_COMPUTE_ERROR_ON(_parent == nullptr); + return _parent->total_size(); + } + PaddingSize padding() const override + { + ARM_COMPUTE_ERROR_ON(_parent == nullptr); + return _parent->padding(); + } + bool has_padding() const override + { + ARM_COMPUTE_ERROR_ON(_parent == nullptr); + return _parent->has_padding(); + } + bool is_resizable() const override + { + ARM_COMPUTE_ERROR_ON(_parent == nullptr); + return _parent->is_resizable(); + } + void set_is_resizable(bool is_resizable) override + { + ARM_COMPUTE_ERROR_ON(_parent == nullptr); + _parent->set_is_resizable(is_resizable); + } + ValidRegion valid_region() const override + { + return _valid_region; + } + void set_valid_region(ValidRegion valid_region) override + { + 
+        ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+        ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(_parent->valid_region(), valid_region);
+        _valid_region = std::move(valid_region);
+    }
+
+private:
+    ITensorInfo *_parent;
+    TensorShape  _tensor_shape;
+    Coordinates  _coords;
+    ValidRegion  _valid_region;
+};
+}
+#endif /*__ARM_COMPUTE_SUBTENSORINFO_H__ */
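Illustrative sketch for SubTensorInfo (editorial addition): a 64x64 window into a 128x128 F32 parent, anchored at (32, 32). Most metadata queries (data type, strides, element size, ...) are forwarded to the parent, as the overrides above show; only the shape and start coordinates differ.

    #include "arm_compute/core/SubTensorInfo.h"
    #include "arm_compute/core/TensorInfo.h"

    using namespace arm_compute;

    void make_subtensor_info(TensorInfo &parent)
    {
        parent.init(TensorShape(128U, 128U), 1 /* num_channels */, DataType::F32);
        SubTensorInfo sub(&parent, TensorShape(64U, 64U), Coordinates(32, 32));
        // sub.data_type() == parent.data_type(); sub.dimension(0) == 64
    }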
diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h
new file mode 100644
index 0000000000..35b9ccb9ff
--- /dev/null
+++ b/arm_compute/core/TensorInfo.h
@@ -0,0 +1,300 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TENSORINFO_H__
+#define __ARM_COMPUTE_TENSORINFO_H__
+
+#include "arm_compute/core/ITensorInfo.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Strides.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+class HOGInfo;
+
+/** Store the tensor's metadata */
+class TensorInfo final : public ITensorInfo
+{
+public:
+    /** Default constructor */
+    TensorInfo();
+    /** Default destructor */
+    ~TensorInfo() = default;
+    /** Construct a tensor info from another tensor info */
+    TensorInfo(const ITensorInfo &info);
+    /** Allow instances of this class to be copy constructed */
+    TensorInfo(const TensorInfo &) = default;
+    /** Allow instances of this class to be copied */
+    TensorInfo &operator=(const TensorInfo &) = default;
+    /** Allow instances of this class to be move constructed */
+    TensorInfo(TensorInfo &&) = default;
+    /** Allow instances of this class to be moved */
+    TensorInfo &operator=(TensorInfo &&) = default;
+
+    /** Construct a tensor info with a format.
+     *
+     * Can be used for automatic derivation of the shape by the function.
+     *
+     * @param[in] format Format of the tensor.
+     */
+    TensorInfo(Format format);
+
+    /** 2D tensor constructor
+     *
+     * @param[in] width  Width of the 2D tensor
+     * @param[in] height Height of the 2D tensor
+     * @param[in] format Single plane format of the tensor.
+     */
+    TensorInfo(unsigned int width, unsigned int height, Format format);
+    /** Constructor
+     *
+     * @param[in] tensor_shape It specifies the size for each dimension of the tensor in number of elements.
+     * @param[in] format       Single plane format of the tensor.
+     */
+    TensorInfo(const TensorShape &tensor_shape, Format format);
+
+    /** Construct a tensor info with a data type and number of channels.
+     *
+     * Can be used for automatic derivation of the shape by the function.
+     *
+     * @param[in] num_channels         It indicates the number of channels for each tensor element
+     * @param[in] data_type            Data type to use for each tensor element
+     * @param[in] fixed_point_position (Optional) It specifies the fixed point position when the tensor data type is QS8, QS16 or QS32.
+     */
+    TensorInfo(size_t num_channels, DataType data_type, size_t fixed_point_position = 0);
+
+    /** Constructor
+     *
+     * @param[in] tensor_shape         It specifies the size for each dimension of the tensor in number of elements.
+     * @param[in] num_channels         It indicates the number of channels for each tensor element
+     * @param[in] data_type            Data type to use for each tensor element
+     * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16.
+     */
+    TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position = 0);
+    /** Constructor
+     *
+     * @param[in] hog_info HOG's metadata used to allocate normalized HOG space
+     * @param[in] width    Width of the 2D tensor where the HOG descriptor will be computed on
+     * @param[in] height   Height of the 2D tensor where the HOG descriptor will be computed on
+     */
+    TensorInfo(const HOGInfo &hog_info, unsigned int width, unsigned int height);
+
+    /** Initialize the tensor info with just a format.
+     *
+     * Can be used for automatic derivation of the shape by the function.
+     *
+     * @param[in] format Single plane format of the tensor.
+     */
+    void init(Format format);
+
+    /** Initialize the metadata structure with the given parameters
+     *
+     * @param[in] tensor_shape Size for each dimension of the tensor in number of elements.
+     * @param[in] format       Single plane format of the tensor.
+     */
+    void init(const TensorShape &tensor_shape, Format format);
+    /** Initialize the metadata structure with the given parameters
+     *
+     * @param[in] tensor_shape                  Size for each dimension of the tensor in number of elements.
+     * @param[in] format                        Single plane format of the tensor.
+     * @param[in] strides_in_bytes              Stride in bytes for accessing each dimension of the tensor.
+     * @param[in] offset_first_element_in_bytes Offset in bytes from the beginning of memory allocation to access the first element.
+     * @param[in] total_size_in_bytes           Size in bytes of the memory allocation (including the offset to the first element).
+     */
+    void init(const TensorShape &tensor_shape, Format format, const Strides &strides_in_bytes, size_t offset_first_element_in_bytes, size_t total_size_in_bytes);
+
+    /** Initialize the tensor info with a data type and number of channels.
+     *
+     * Can be used for automatic derivation of the shape by the function.
+     *
+     * @param[in] num_channels         Desired number of channels for each tensor element.
+     * @param[in] data_type            Data type to use for each tensor element.
+     * @param[in] fixed_point_position (Optional) Fixed point position when the tensor data type is QS8, QS16 or QS32.
+     */
+    void init(size_t num_channels, DataType data_type, size_t fixed_point_position = 0);
+
+    /** Initialize the metadata structure with the given parameters
+     *
+     * @param[in] tensor_shape Size for each dimension of the tensor in number of elements.
+     * @param[in] num_channels Desired number of channels for each tensor element.
+ * @param[in] data_type Data type to use for each tensor element. + * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16. + */ + void init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position = 0); + /** Initialize the metadata structure with the given parameters + * + * @param[in] tensor_shape Size for each dimension of the tensor in number of elements. + * @param[in] num_channels Desired number of channels for each tensor element. + * @param[in] data_type Data type to use for each tensor element. + * @param[in] strides_in_bytes Stride in bytes for accessing each dimension of the tensor. + * @param[in] offset_first_element_in_bytes Offset in bytes from the beginning of memory allocation to access the first element. + * @param[in] total_size_in_bytes Size in bytes of the memory allocation (including the offset to the first element). + * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16. + */ + void init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, const Strides &strides_in_bytes, size_t offset_first_element_in_bytes, + size_t total_size_in_bytes, int fixed_point_position = 0); + /** Initialize the metadata structure for the given HOG's metadata + * + * @param[in] hog_info HOG's metadata used to allocate normalized HOG space + * @param[in] width Width of the 2D tensor where the HOG descriptor will be computed on + * @param[in] height Height of the 2D tensor where the HOG descriptor will be computed on + */ + void init(const HOGInfo &hog_info, unsigned int width, unsigned int height); + /** Initialize the metadata structure for the given tensor shape and single-plane format, (Padding is automatically calculated) + * + * @note The padding used by this method is really conservative so that the tensor can be used for most functions. + * + * @param[in] tensor_shape It specifies the size for each dimension of the tensor in number of elements + * @param[in] format Single plane format of the image. + * + * @return Total allocation size including padding in bytes. + */ + size_t init_auto_padding(const TensorShape &tensor_shape, Format format); + /** Initialize the metadata structure for the given tensor shape, number of channels, + * data type and fixed point position. (Padding is automatically calculated) + * + * @note The padding used by this method is really conservative so that the tensor can be used for most functions. + * + * @param[in] tensor_shape It specifies the size for each dimension of the tensor in number of elements + * @param[in] num_channels It indicates the number of channels for each tensor element + * @param[in] data_type Data type to use for each tensor element + * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16. + * + * @return Total allocation size including padding in bytes. + */ + size_t init_auto_padding(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position = 0); + /** Initialize the metadata structure for the given HOG's metadata + * + * @note init_auto_padding will be used for the tensor initialization. 
+     *
+     * @param[in] hog_info HOG's metadata used to allocate normalized HOG space
+     * @param[in] width    Width of the 2D tensor where the HOG descriptor will be computed on
+     * @param[in] height   Height of the 2D tensor where the HOG descriptor will be computed on
+     */
+    size_t init_auto_padding(const HOGInfo &hog_info, unsigned int width, unsigned int height);
+
+    // Inherited methods overridden:
+    void set_data_type(DataType data_type) override;
+    void set_num_channels(int num_channels) override;
+    void set_format(Format format) override;
+    void set_tensor_shape(TensorShape shape) override;
+    void set_fixed_point_position(int fixed_point_position) override;
+    bool auto_padding() override;
+    bool extend_padding(const PaddingSize &padding) override;
+    size_t dimension(size_t index) const override
+    {
+        return _tensor_shape[index];
+    }
+    const Strides &strides_in_bytes() const override
+    {
+        return _strides_in_bytes;
+    }
+    size_t offset_first_element_in_bytes() const override
+    {
+        return _offset_first_element_in_bytes;
+    }
+    size_t offset_element_in_bytes(const Coordinates &pos) const override;
+    int fixed_point_position() const override
+    {
+        return _fixed_point_position;
+    }
+    size_t element_size() const override
+    {
+        return data_size_from_type(_data_type) * _num_channels;
+    }
+    size_t num_dimensions() const override
+    {
+        return _tensor_shape.num_dimensions();
+    }
+    size_t num_channels() const override
+    {
+        return _num_channels;
+    }
+    const TensorShape &tensor_shape() const override
+    {
+        return _tensor_shape;
+    }
+    DataType data_type() const override
+    {
+        return _data_type;
+    }
+    Format format() const override
+    {
+        return _format;
+    }
+    size_t total_size() const override
+    {
+        return _total_size;
+    }
+    PaddingSize padding() const override
+    {
+        return _padding;
+    }
+    bool has_padding() const override
+    {
+        return !_padding.empty();
+    }
+    bool is_resizable() const override
+    {
+        return _is_resizable;
+    }
+    void set_is_resizable(bool is_resizable) override
+    {
+        _is_resizable = is_resizable;
+    }
+    ValidRegion valid_region() const override
+    {
+        return _valid_region;
+    }
+    void set_valid_region(ValidRegion valid_region) override
+    {
+        _valid_region = std::move(valid_region);
+    }
+
+private:
+    /** Calculates strides, offset and total size resulting from the specified padding around the XY plane.
+     *
+     * @param[in] padding Padding around the XY plane in elements.
+     */
+    std::tuple<Strides, size_t, size_t> calculate_padding_requirements(const PaddingSize &padding);
+
+    size_t      _total_size;
+    int         _fixed_point_position;
+    size_t      _offset_first_element_in_bytes;
+    Strides     _strides_in_bytes;
+    size_t      _num_channels;
+    TensorShape _tensor_shape;
+    DataType    _data_type;
+    Format      _format;
+    bool        _is_resizable;
+    ValidRegion _valid_region;
+    PaddingSize _padding;
+};
+}
+#endif /*__ARM_COMPUTE_TENSORINFO_H__ */
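Illustrative sketch for TensorInfo (editorial addition): metadata for a 224x224 RGB image tensor. With no padding the row stride is width times element_size(), and total_size() covers the whole allocation; extend_padding() would grow the strides accordingly.

    #include "arm_compute/core/TensorInfo.h"

    using namespace arm_compute;

    TensorInfo make_image_info()
    {
        TensorInfo info(TensorShape(224U, 224U), Format::RGB888);
        // info.element_size() == 3 bytes, info.dimension(0) == 224
        return info;
    }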
diff --git a/arm_compute/core/TensorShape.h b/arm_compute/core/TensorShape.h
new file mode 100644
index 0000000000..f8b3181686
--- /dev/null
+++ b/arm_compute/core/TensorShape.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TENSORSHAPE_H__
+#define __ARM_COMPUTE_TENSORSHAPE_H__
+
+#include "arm_compute/core/Dimensions.h"
+#include "arm_compute/core/Error.h"
+
+#include <algorithm>
+#include <array>
+#include <functional>
+#include <numeric>
+
+namespace arm_compute
+{
+/** Shape of a tensor */
+class TensorShape : public Dimensions<size_t>
+{
+public:
+    /** Constructor to initialize the tensor shape.
+     *
+     * @param[in] dims Values to initialize the dimensions.
+     */
+    template <typename... Ts>
+    TensorShape(Ts... dims)
+        : Dimensions{ dims... }
+    {
+        // Initialize unspecified dimensions to 1
+        if(_num_dimensions > 0)
+        {
+            std::fill(_id.begin() + _num_dimensions, _id.end(), 1);
+        }
+
+        // Correct the number of dimensions to ignore trailing dimensions of size 1
+        apply_dimension_correction();
+    }
+    /** Allow instances of this class to be copy constructed */
+    TensorShape(const TensorShape &) = default;
+    /** Allow instances of this class to be copied */
+    TensorShape &operator=(const TensorShape &) = default;
+    /** Allow instances of this class to be move constructed */
+    TensorShape(TensorShape &&) = default;
+    /** Allow instances of this class to be moved */
+    TensorShape &operator=(TensorShape &&) = default;
+    /** Default destructor */
+    ~TensorShape() = default;
+
+    /** Accessor to set the value of one of the dimensions.
+     *
+     * @param[in] dimension Dimension for which the value is set.
+     * @param[in] value     Value to be set for the dimension.
+     */
+    void set(size_t dimension, size_t value)
+    {
+        ARM_COMPUTE_ERROR_ON(value < 1);
+
+        // Make sure all empty dimensions are filled with 1
+        std::fill(_id.begin() + _num_dimensions, _id.end(), 1);
+
+        // Set the specified dimension and increase the number of dimensions if
+        // necessary
+        Dimensions::set(dimension, value);
+
+        // Correct the number of dimensions to ignore trailing dimensions of size 1
+        apply_dimension_correction();
+    }
+
+    /** Collapse the first n dimensions.
+     *
+     * @param[in] first Dimension into which the following @p n are collapsed.
+     * @param[in] n     Number of dimensions to collapse into @p first.
+     */
+    void collapse(size_t n, size_t first = 0)
+    {
+        Dimensions::collapse(n, first);
+
+        // Make sure all empty dimensions are filled with 1
+        std::fill(_id.begin() + _num_dimensions, _id.end(), 1);
+    }
+
+    /** Collapses all dimensions to a single linear total size.
+     *
+     * @return The total tensor size in terms of elements.
+     */
+    size_t total_size() const
+    {
+        return std::accumulate(_id.begin(), _id.end(), 1, std::multiplies<size_t>());
+    }
+    /** Collapses given dimension and above.
+     *
+     * @note Precondition: dimension < TensorShape::num_max_dimensions
+     *
+     * @param[in] dimension Index of the first dimension to collapse
+     *
+     * @return The linear size of the collapsed dimensions
+     */
+    size_t total_size_upper(size_t dimension) const
+    {
+        return std::accumulate(_id.begin() + dimension, _id.end(), 1, std::multiplies<size_t>());
+    }
+
+private:
+    /** Remove trailing dimensions of size 1 from the reported number of dimensions. */
+    void apply_dimension_correction()
+    {
+        for(int i = static_cast<int>(_num_dimensions) - 1; i >= 0; --i)
+        {
+            if(_id[i] == 1)
+            {
+                --_num_dimensions;
+            }
+            else
+            {
+                break;
+            }
+        }
+    }
+};
+}
+#endif /*__ARM_COMPUTE_TENSORSHAPE_H__*/
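Illustrative sketch for TensorShape (editorial addition): trailing dimensions of size 1 do not count towards num_dimensions(), and total_size() is the plain product of all dimensions.

    #include "arm_compute/core/TensorShape.h"

    void shape_example()
    {
        arm_compute::TensorShape s(13U, 8U, 1U); // reported as 2D: the trailing 1 is ignored
        // s.num_dimensions() == 2, s.total_size() == 104
        s.collapse(2);                           // merge the first two dimensions
        // s[0] == 104 now
    }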
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
new file mode 100644
index 0000000000..725567b9ae
--- /dev/null
+++ b/arm_compute/core/Types.h
@@ -0,0 +1,636 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TYPES_H__
+#define __ARM_COMPUTE_TYPES_H__
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/TensorShape.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <utility>
+
+namespace arm_compute
+{
+/** Image colour formats */
+enum class Format
+{
+    UNKNOWN,  /**< Unknown image format */
+    U8,       /**< 1 channel, 1 U8 per channel */
+    S16,      /**< 1 channel, 1 S16 per channel */
+    U16,      /**< 1 channel, 1 U16 per channel */
+    S32,      /**< 1 channel, 1 S32 per channel */
+    U32,      /**< 1 channel, 1 U32 per channel */
+    F16,      /**< 1 channel, 1 F16 per channel */
+    F32,      /**< 1 channel, 1 F32 per channel */
+    UV88,     /**< 2 channel, 1 U8 per channel */
+    RGB888,   /**< 3 channels, 1 U8 per channel */
+    RGBA8888, /**< 4 channels, 1 U8 per channel */
+    YUV444,   /**< A 3 plane of 8 bit 4:4:4 sampled Y, U, V planes */
+    YUYV422,  /**< A single plane of 32-bit macro pixel of Y0, U0, Y1, V0 bytes */
+    NV12,     /**< A 2 plane YUV format of Luma (Y) and interleaved UV data at 4:2:0 sampling */
+    NV21,     /**< A 2 plane YUV format of Luma (Y) and interleaved VU data at 4:2:0 sampling */
+    IYUV,     /**< A 3 plane of 8-bit 4:2:0 sampled Y, U, V planes */
+    UYVY422   /**< A single plane of 32-bit macro pixel of U0, Y0, V0, Y1 bytes */
+};
+
+/** Available data types */
+enum class DataType
+{
+    UNKNOWN,
+    U8,
+    S8,
+    QS8,
+    U16,
+    S16,
+    QS16,
+    U32,
+    S32,
+    U64,
+    S64,
+    F16,
+    F32,
+    F64,
+    SIZET
+};
+
+/** Constant value of the border pixels when using BorderMode::CONSTANT */
+constexpr uint8_t CONSTANT_BORDER_VALUE = 199;
+
+/** Constant value used to indicate a half-scale pyramid */
+constexpr float SCALE_PYRAMID_HALF = 0.5f;
+
+/** Constant value used to indicate an ORB scaled pyramid */
+constexpr float SCALE_PYRAMID_ORB = 8.408964152537146130583778358414e-01;
+
+/** Describes the anchor (start coordinates) and shape of a tensor's valid region */
+struct ValidRegion
+{
+    ValidRegion()
+        : anchor{}, shape{}
+    {
+    }
+
+    ValidRegion(const ValidRegion &) = default;
+    ValidRegion(ValidRegion &&)      = default;
+    ValidRegion &operator=(const ValidRegion &) = default;
+    ValidRegion &operator=(ValidRegion &&) = default;
+    ~ValidRegion()                         = default;
+
+    ValidRegion(Coordinates anchor, TensorShape shape)
+        : anchor{ anchor }, shape{ shape }
+    {
+    }
+
+    /** Return the start of the valid region for the given dimension @p d */
+    int start(unsigned int d) const
+    {
+        return anchor[d];
+    }
+
+    /** Return the end of the valid region for the given dimension @p d */
+    int end(unsigned int d) const
+    {
+        return anchor[d] + shape[d];
+    }
+
+    Coordinates anchor;
+    TensorShape shape;
+};
+
+/** Methods available to handle borders */
+enum class BorderMode
+{
+    UNDEFINED, /**< Borders are left undefined */
+    CONSTANT,  /**< Pixels outside the image are assumed to have a constant value */
+    REPLICATE  /**< Pixels outside the image are assumed to have the same value as the closest image pixel */
+};
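Illustrative sketch for ValidRegion (editorial addition): a 3x3 filter run with BorderMode::UNDEFINED on a 640x480 image leaves a one-pixel rim invalid, so the valid region starts at (1, 1) and spans 638x478.

    #include "arm_compute/core/Types.h"

    arm_compute::ValidRegion filter_valid_region()
    {
        arm_compute::ValidRegion region(arm_compute::Coordinates(1, 1),
                                        arm_compute::TensorShape(638U, 478U));
        // region.start(0) == 1, region.end(0) == 639
        return region;
    }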
+/** Container for 2D border size */
+struct BorderSize
+{
+    /** Empty border, i.e. no border */
+    constexpr BorderSize()
+        : top{ 0 }, right{ 0 }, bottom{ 0 }, left{ 0 }
+    {
+    }
+
+    /** Border with equal size around the 2D plane */
+    constexpr BorderSize(unsigned int size)
+        : top{ size }, right{ size }, bottom{ size }, left{ size }
+    {
+    }
+
+    /** Border with same size for top/bottom and left/right */
+    constexpr BorderSize(unsigned int top_bottom, unsigned int left_right)
+        : top{ top_bottom }, right{ left_right }, bottom{ top_bottom }, left{ left_right }
+    {
+    }
+
+    /** Border with different sizes */
+    constexpr BorderSize(unsigned int top, unsigned int right, unsigned int bottom, unsigned int left)
+        : top{ top }, right{ right }, bottom{ bottom }, left{ left }
+    {
+    }
+
+    /** Check if the entire border is zero */
+    constexpr bool empty() const
+    {
+        return top == 0 && right == 0 && bottom == 0 && left == 0;
+    }
+
+    /** Check if the border is the same size on all sides */
+    constexpr bool uniform() const
+    {
+        return top == right && top == bottom && top == left;
+    }
+
+    /** Scale the border size in place by the given factor */
+    BorderSize &operator*=(float scale)
+    {
+        top *= scale;
+        right *= scale;
+        bottom *= scale;
+        left *= scale;
+
+        return *this;
+    }
+
+    /** Return a copy of the border size scaled by the given factor */
+    BorderSize operator*(float scale)
+    {
+        BorderSize size = *this;
+        size *= scale;
+
+        return size;
+    }
+
+    /** Limit each side of this border to the corresponding side of @p limit */
+    void limit(const BorderSize &limit)
+    {
+        top    = std::min(top, limit.top);
+        right  = std::min(right, limit.right);
+        bottom = std::min(bottom, limit.bottom);
+        left   = std::min(left, limit.left);
+    }
+
+    unsigned int top;
+    unsigned int right;
+    unsigned int bottom;
+    unsigned int left;
+};
+
+using PaddingSize = BorderSize;
+
+/** Policy to handle overflow */
+enum class ConvertPolicy
+{
+    WRAP,    /**< Wrap around */
+    SATURATE /**< Saturate */
+};
+
+/** Interpolation method */
+enum class InterpolationPolicy
+{
+    NEAREST_NEIGHBOR, /**< Output values are defined to match the source pixel whose center is nearest to the sample position */
+    BILINEAR,         /**< Output values are defined by bilinear interpolation between the pixels */
+    AREA,             /**< Output values are determined by averaging the source pixels whose areas fall under the area of the destination pixel, projected onto the source image */
+};
+
+/** Bilinear Interpolation method used by LKTracker */
+enum class BilinearInterpolation
+{
+    BILINEAR_OLD_NEW,
+    BILINEAR_SCHARR
+};
+
+/** Threshold mode */
+enum class ThresholdType
+{
+    BINARY, /**< Threshold with one value */
+    RANGE   /**< Threshold with two values */
+};
+
+/** Rounding method */
+enum class RoundingPolicy
+{
+    TO_ZERO,        /**< Truncates the least significant values that are lost in operations. */
+    TO_NEAREST_UP,  /**< Rounds to nearest value; half rounds up */
+    TO_NEAREST_EVEN /**< Rounds to nearest value; half rounds to nearest even */
+};
+
+/** Termination criteria */
+enum class Termination
+{
+    TERM_CRITERIA_EPSILON,
+    TERM_CRITERIA_ITERATIONS,
+    TERM_CRITERIA_BOTH
+};
+
+/** Magnitude calculation type. */
+enum class MagnitudeType
+{
+    L1NORM, /**< L1 normalization type */
+    L2NORM  /**< L2 normalization type */
+};
+/** Phase calculation type.
+ *
+ * @note When PhaseType == SIGNED, each angle is mapped to the range 0 to 255 inclusive, otherwise only angles between 0 and 180 are mapped.
+ */
+enum class PhaseType
+{
+    SIGNED,  /**< Angle range: [0, 360] */
+    UNSIGNED /**< Angle range: [0, 180] */
+};
+
+/** Keypoint type */
+struct KeyPoint
+{
+    int32_t x{ 0 };               /**< X coordinates */
+    int32_t y{ 0 };               /**< Y coordinates */
+    float   strength{ 0.f };      /**< Strength of the point */
+    float   scale{ 0.f };         /**< Scale initialized to 0 by the corner detector */
+    float   orientation{ 0.f };   /**< Orientation initialized to 0 by the corner detector */
+    int32_t tracking_status{ 0 }; /**< Status initialized to 1 by the corner detector, set to 0 when the point is lost */
+    float   error{ 0.f };         /**< Tracking error initialized to 0 by the corner detector */
+};
+
+using InternalKeypoint = std::tuple<float, float, float>; /* x,y,strength */
+
+/** Rectangle type */
+struct Rectangle
+{
+    uint16_t x;      /**< Top-left x coordinate */
+    uint16_t y;      /**< Top-left y coordinate */
+    uint16_t width;  /**< Width of the rectangle */
+    uint16_t height; /**< Height of the rectangle */
+};
+
+/** Coordinate type */
+struct Coordinates2D
+{
+    int32_t x; /**< X coordinates */
+    int32_t y; /**< Y coordinates */
+};
+
+/** Coordinate type */
+struct Coordinates3D
+{
+    uint32_t x; /**< X coordinates */
+    uint32_t y; /**< Y coordinates */
+    uint32_t z; /**< Z coordinates */
+};
+
+/** Available channels */
+enum class Channel
+{
+    UNKNOWN, /**< Unknown channel format */
+    C0,      /**< First channel (used by formats with unknown channel types). */
+    C1,      /**< Second channel (used by formats with unknown channel types). */
+    C2,      /**< Third channel (used by formats with unknown channel types). */
+    C3,      /**< Fourth channel (used by formats with unknown channel types). */
+    R,       /**< Red channel. */
+    G,       /**< Green channel. */
+    B,       /**< Blue channel. */
+    A,       /**< Alpha channel. */
+    Y,       /**< Luma channel. */
+    U,       /**< Cb/U channel. */
+    V        /**< Cr/V/Value channel. */
+};
+
+/** Available matrix patterns */
+enum class MatrixPattern
+{
+    BOX,   /**< Box pattern matrix. */
+    CROSS, /**< Cross pattern matrix. */
+    DISK,  /**< Disk pattern matrix. */
+    OTHER  /**< Any other matrix pattern. */
+};
+
+/** Available non linear functions. */
+enum class NonLinearFilterFunction : unsigned
+{
+    MEDIAN = 0, /**< Non linear median filter. */
+    MIN    = 1, /**< Non linear erode. */
+    MAX    = 2, /**< Non linear dilate. */
+};
+
+/** The normalization type used for the normalization layer */
+enum class NormType
+{
+    IN_MAP_1D, /**< Normalization applied within the same map in 1D region */
+    IN_MAP_2D, /**< Normalization applied within the same map in 2D region */
+    CROSS_MAP  /**< Normalization applied cross maps */
+};
+
+/** Normalization type for Histogram of Oriented Gradients (HOG) */
+enum class HOGNormType
+{
+    L2_NORM    = 1, /**< L2-norm */
+    L2HYS_NORM = 2, /**< L2-norm followed by clipping */
+    L1_NORM    = 3  /**< L1 norm */
+};
+/** Detection window used for object detection. The detection window keeps the following information:
+ *
+ * -# Geometry of the rectangular window (x/y of top-left corner and width/height)
+ * -# Index of the class used for evaluating which class the detection window belongs to
+ * -# Confidence value (score) obtained with the classifier
+ */
+struct DetectionWindow
+{
+    uint16_t x{ 0 };         /**< Top-left x coordinate */
+    uint16_t y{ 0 };         /**< Top-left y coordinate */
+    uint16_t width{ 0 };     /**< Width of the detection window */
+    uint16_t height{ 0 };    /**< Height of the detection window */
+    uint16_t idx_class{ 0 }; /**< Index of the class */
+    float    score{ 0.f };   /**< Confidence value for the detection window */
+};
+
+/** Dimension rounding type when down-scaling on CNNs
+ * @note Used in pooling and convolution layer
+ */
+enum class DimensionRoundingType
+{
+    FLOOR, /**< Floor rounding */
+    CEIL   /**< Ceil rounding */
+};
+
+/** Available pooling types */
+enum class PoolingType
+{
+    MAX, /**< Max Pooling */
+    AVG  /**< Average Pooling */
+};
+
+/** Padding and stride information class */
+class PadStrideInfo
+{
+public:
+    /** Constructor
+     *
+     * @param[in] stride_x (Optional) Stride, in elements, across x. Defaults to 1.
+     * @param[in] stride_y (Optional) Stride, in elements, across y. Defaults to 1.
+     * @param[in] pad_x    (Optional) Padding, in elements, across x. Defaults to 0.
+     * @param[in] pad_y    (Optional) Padding, in elements, across y. Defaults to 0.
+     * @param[in] round    (Optional) Dimensions rounding. Defaults to @ref DimensionRoundingType::FLOOR.
+     */
+    PadStrideInfo(unsigned int stride_x = 1, unsigned int stride_y = 1,
+                  unsigned int pad_x = 0, unsigned int pad_y = 0,
+                  DimensionRoundingType round = DimensionRoundingType::FLOOR)
+        : _stride(std::make_pair(stride_x, stride_y)),
+          _pad(std::make_pair(pad_x, pad_y)),
+          _round_type(round)
+    {
+    }
+    std::pair<unsigned int, unsigned int> stride() const
+    {
+        return _stride;
+    }
+    std::pair<unsigned int, unsigned int> pad() const
+    {
+        return _pad;
+    }
+    DimensionRoundingType round() const
+    {
+        return _round_type;
+    }
+
+private:
+    std::pair<unsigned int, unsigned int> _stride;
+    std::pair<unsigned int, unsigned int> _pad;
+    DimensionRoundingType                 _round_type;
+};
+
+/** Pooling Layer Information class */
+class PoolingLayerInfo
+{
+public:
+    /** Default Constructor
+     *
+     * @param[in] pool_type       Pooling type @ref PoolingType. Defaults to @ref PoolingType::MAX
+     * @param[in] pool_size       (Optional) Pooling size, in elements, across x and y. Defaults to 2.
+     * @param[in] pad_stride_info (Optional) Padding and stride information @ref PadStrideInfo
+     */
+    PoolingLayerInfo(PoolingType pool_type = PoolingType::MAX, unsigned int pool_size = 2, PadStrideInfo pad_stride_info = PadStrideInfo())
+        : _pool_type(pool_type), _pool_size(pool_size), _pad_stride_info(pad_stride_info)
+    {
+    }
+    PoolingType pool_type() const
+    {
+        return _pool_type;
+    }
+    unsigned int pool_size() const
+    {
+        return _pool_size;
+    }
+    PadStrideInfo pad_stride_info() const
+    {
+        return _pad_stride_info;
+    }
+
+private:
+    PoolingType   _pool_type;
+    unsigned int  _pool_size;
+    PadStrideInfo _pad_stride_info;
+};
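Illustrative sketch for PadStrideInfo and PoolingLayerInfo (editorial addition): 3x3 max pooling with stride 2 and 1-pixel padding. For a 112-wide input this gives floor((112 + 2*1 - 3) / 2) + 1 = 56 output columns with FLOOR rounding; the output-size formula is assumed from common CNN conventions, not taken from this header.

    #include "arm_compute/core/Types.h"

    arm_compute::PoolingLayerInfo make_pool_info()
    {
        using namespace arm_compute;
        return PoolingLayerInfo(PoolingType::MAX, 3 /* pool_size */,
                                PadStrideInfo(2, 2, 1, 1, DimensionRoundingType::FLOOR));
    }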
+
+/** Activation Layer Information class */
+class ActivationLayerInfo
+{
+public:
+    /** Available activation functions */
+    enum class ActivationFunction
+    {
+        LOGISTIC,     /**< Logistic */
+        TANH,         /**< Hyperbolic tangent */
+        RELU,         /**< Rectifier */
+        BOUNDED_RELU, /**< Bounded Rectifier */
+        SOFT_RELU,    /**< Soft Rectifier */
+        ABS,          /**< Absolute */
+        SQUARE,       /**< Square */
+        SQRT,         /**< Square root */
+        LINEAR        /**< Linear */
+    };
+
+    /** Default Constructor
+     *
+     * @param[in] f The activation function to use.
+     * @param[in] a (Optional) The alpha parameter used by some activation functions
+     *              (@ref ActivationFunction::BOUNDED_RELU, @ref ActivationFunction::LINEAR, @ref ActivationFunction::TANH).
+     * @param[in] b (Optional) The beta parameter used by some activation functions (@ref ActivationFunction::LINEAR, @ref ActivationFunction::TANH).
+     */
+    ActivationLayerInfo(ActivationFunction f, float a = 0.0f, float b = 0.0f)
+        : _act(f), _a(a), _b(b)
+    {
+    }
+    ActivationFunction activation() const
+    {
+        return _act;
+    }
+    float a() const
+    {
+        return _a;
+    }
+    float b() const
+    {
+        return _b;
+    }
+
+private:
+    ActivationFunction _act;
+    float              _a;
+    float              _b;
+};
+
+/** Normalization Layer Information class */
+class NormalizationLayerInfo
+{
+public:
+    /** Default Constructor
+     *
+     * @param[in] type      The normalization type. Can be @ref NormType::IN_MAP_1D, @ref NormType::IN_MAP_2D or @ref NormType::CROSS_MAP
+     * @param[in] norm_size The normalization size is the number of elements to normalize across. Defaults to 5.
+     * @param[in] alpha     Alpha parameter used by the normalization equation. Defaults to 0.0001.
+     * @param[in] beta      Beta parameter used by the normalization equation. Defaults to 0.5.
+     * @param[in] kappa     Kappa parameter used by the [Krizhevsky 2012] Across Channel Local Brightness Normalization equation. Defaults to 1.
+     */
+    NormalizationLayerInfo(NormType type, uint32_t norm_size = 5, float alpha = 0.0001f, float beta = 0.5f, float kappa = 1.f)
+        : _type(type), _norm_size(norm_size), _alpha(alpha), _beta(beta), _kappa(kappa)
+    {
+    }
+    NormType type() const
+    {
+        return _type;
+    }
+    uint32_t norm_size() const
+    {
+        return _norm_size;
+    }
+    float alpha() const
+    {
+        return _alpha;
+    }
+    float beta() const
+    {
+        return _beta;
+    }
+    float kappa() const
+    {
+        return _kappa;
+    }
+    /** Return the scaling factor of the normalization function.
+     *
+     * If kappa is not 1 then [Krizhevsky 2012] normalization scaling is specified. The scaling
+     * factor takes into account the total number of elements used for the normalization, so in
+     * the 2D case this is _norm_size^2.
+     *
+     * @return The normalization scaling factor.
+     */
+    float scale_coeff() const
+    {
+        const uint32_t size = (_type == NormType::IN_MAP_2D) ? _norm_size * _norm_size : _norm_size;
+        return (_kappa == 1.f) ? (_alpha / size) : _alpha;
+    }
+
+private:
+    NormType _type;
+    uint32_t _norm_size;
+    float    _alpha;
+    float    _beta;
+    float    _kappa;
+};
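+
+/* Worked example (illustrative only): with type == NormType::IN_MAP_2D and the
+ * defaults norm_size == 5, alpha == 0.0001 and kappa == 1, scale_coeff()
+ * normalizes over a 5x5 = 25 element region and returns alpha / 25 = 4e-6.
+ * With kappa != 1 the [Krizhevsky 2012] variant applies and alpha is returned
+ * unscaled.
+ *
+ *   NormalizationLayerInfo norm_info(NormType::IN_MAP_2D);
+ *   float coeff = norm_info.scale_coeff(); // 0.0001f / 25
+ */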
+
+/** Convolution Layer Weights Information class */
+class WeightsInfo
+{
+public:
+    WeightsInfo()
+        : _are_reshaped(false), _kernel_size(0)
+    {
+    }
+    /** Constructor
+     *
+     * @param[in] are_reshaped True if the weights have been reshaped
+     * @param[in] kernel_size  The size of the kernel.
+     */
+    WeightsInfo(bool are_reshaped, unsigned int kernel_size)
+        : _are_reshaped(are_reshaped), _kernel_size(kernel_size)
+    {
+    }
+
+    bool are_reshaped() const
+    {
+        return _are_reshaped;
+    };
+    unsigned int kernel_size() const
+    {
+        return _kernel_size;
+    }
+
+private:
+    const bool         _are_reshaped;
+    const unsigned int _kernel_size;
+};
+
+/** IO formatting information class */
+struct IOFormatInfo
+{
+    /** Precision type used when printing floating point numbers */
+    enum class PrecisionType
+    {
+        Default, /**< Default precision to the one that the current stream has */
+        Custom,  /**< Custom precision specified by the user using the precision parameter */
+        Full     /**< The maximum precision of the floating point representation */
+    };
+
+    /** Specifies the area to be printed, used by Tensor objects */
+    enum class PrintRegion
+    {
+        ValidRegion, /**< Prints the valid region of the Tensor object */
+        NoPadding,   /**< Prints the Tensor object without the padding */
+        Full         /**< Prints the Tensor object including padding */
+    };
+
+    IOFormatInfo(PrintRegion   print_region   = PrintRegion::ValidRegion,
+                 PrecisionType precision_type = PrecisionType::Default,
+                 unsigned int  precision      = 10,
+                 bool          align_columns  = true,
+                 std::string   element_delim  = " ",
+                 std::string   row_delim      = "\n")
+        : print_region(print_region),
+          precision_type(precision_type),
+          precision(precision),
+          element_delim(element_delim),
+          row_delim(row_delim),
+          align_columns(align_columns)
+    {
+    }
+
+    PrintRegion   print_region;
+    PrecisionType precision_type;
+    unsigned int  precision;
+    std::string   element_delim;
+    std::string   row_delim;
+    bool          align_columns;
+};
+}
+#endif /* __ARM_COMPUTE_TYPES_H__ */
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
new file mode 100644
index 0000000000..9d3ff0a1bd
--- /dev/null
+++ b/arm_compute/core/Utils.h
@@ -0,0 +1,740 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_UTILS_H__
+#define __ARM_COMPUTE_UTILS_H__
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <numeric>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+namespace arm_compute
+{
+/** Computes the smallest number larger or equal to value that is a multiple of divisor. */
+template <typename S, typename T>
+inline auto ceil_to_multiple(S value, T divisor) -> decltype(((value + divisor - 1) / divisor) * divisor)
+{
+    ARM_COMPUTE_ERROR_ON(value < 0 || divisor <= 0);
+    return ((value + divisor - 1) / divisor) * divisor;
+}
+
+/** Computes the largest number smaller or equal to value that is a multiple of divisor. */
+template <typename S, typename T>
+inline auto floor_to_multiple(S value, T divisor) -> decltype((value / divisor) * divisor)
+{
+    ARM_COMPUTE_ERROR_ON(value < 0 || divisor <= 0);
+    return (value / divisor) * divisor;
+}
+
+/** Calculate the rounded up quotient of val / m. */
+template <typename S, typename T>
+constexpr auto DIV_CEIL(S val, T m) -> decltype((val + m - 1) / m)
+{
+    return (val + m - 1) / m;
+}
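+
+/* Worked example (illustrative only): for value 17 and divisor 8,
+ * ceil_to_multiple(17, 8) == 24, floor_to_multiple(17, 8) == 16 and
+ * DIV_CEIL(17, 8) == 3, since 17 = 2 * 8 + 1 rounds up to three blocks of 8.
+ */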
+
+/** Returns the arm_compute library build information
+ *
+ * Contains the version number and the build options used to build the library
+ *
+ * @return The arm_compute library build information
+ */
+std::string build_information();
+
+/** Load an entire file in memory
+ *
+ * @param[in] filename Name of the file to read.
+ * @param[in] binary   True if the file is binary.
+ *
+ * @return The content of the file.
+ */
+std::string read_file(const std::string &filename, bool binary);
+
+/** Return a value as a string
+ *
+ * @param[in] val Input value.
+ *
+ * @return Value represented as a string
+ */
+template <typename T>
+const std::string val_to_string(T val)
+{
+    return static_cast<const std::ostringstream &>(std::ostringstream() << val).str();
+}
+
+/** The size in bytes of the data type
+ *
+ * @param[in] data_type Input data type
+ *
+ * @return The size in bytes of the data type
+ */
+inline size_t data_size_from_type(DataType data_type)
+{
+    switch(data_type)
+    {
+        case DataType::U8:
+        case DataType::S8:
+        case DataType::QS8:
+            return 1;
+        case DataType::U16:
+        case DataType::S16:
+        case DataType::F16:
+        case DataType::QS16:
+            return 2;
+        case DataType::F32:
+        case DataType::U32:
+        case DataType::S32:
+            return 4;
+        case DataType::F64:
+        case DataType::U64:
+        case DataType::S64:
+            return 8;
+        case DataType::SIZET:
+            return sizeof(size_t);
+        default:
+            ARM_COMPUTE_ERROR("Invalid data type");
+            return 0;
+    }
+}
+
+/** The size in bytes of the pixel format
+ *
+ * @param[in] format Input format
+ *
+ * @return The size in bytes of the pixel format
+ */
+inline size_t pixel_size_from_format(Format format)
+{
+    switch(format)
+    {
+        case Format::U8:
+            return 1;
+        case Format::U16:
+        case Format::S16:
+        case Format::F16:
+        case Format::UV88:
+        case Format::YUYV422:
+        case Format::UYVY422:
+            return 2;
+        case Format::RGB888:
+            return 3;
+        case Format::RGBA8888:
+            return 4;
+        case Format::U32:
+        case Format::S32:
+        case Format::F32:
+            return 4;
+        //Doesn't make sense for planar formats:
+        case Format::NV12:
+        case Format::NV21:
+        case Format::IYUV:
+        case Format::YUV444:
+        default:
+            ARM_COMPUTE_ERROR("Undefined pixel size for given format");
+            return 0;
+    }
+}
+
+/** The size in bytes of the data type
+ *
+ * @param[in] dt Input data type
+ *
+ * @return The size in bytes of the data type
+ */
+inline size_t element_size_from_data_type(DataType dt)
+{
+    switch(dt)
+    {
+        case DataType::S8:
+        case DataType::U8:
+        case DataType::QS8:
+            return 1;
+        case DataType::U16:
+        case DataType::S16:
+        case DataType::QS16:
+        case DataType::F16:
+            return 2;
+        case DataType::U32:
+        case DataType::S32:
+        case DataType::F32:
+            return 4;
+        default:
+            ARM_COMPUTE_ERROR("Undefined element size for given data type");
+            return 0;
+    }
+}
+
+/** Return the data type used by a given single-planar pixel format
+ *
+ * @param[in] format Input format
+ *
+ * @return The data type used by the given format
+ */
+inline DataType data_type_from_format(Format format)
+{
+    switch(format)
+    {
+        case Format::U8:
+        case Format::UV88:
+        case Format::RGB888:
+        case Format::RGBA8888:
+        case Format::YUYV422:
+        case Format::UYVY422:
+            return DataType::U8;
+        case Format::U16:
+            return DataType::U16;
+        case Format::S16:
+            return DataType::S16;
+        case Format::U32:
+            return DataType::U32;
+        case Format::S32:
+            return DataType::S32;
+        case Format::F16:
+            return DataType::F16;
+        case Format::F32:
+            return DataType::F32;
+        //Doesn't make sense for planar formats:
+        case Format::NV12:
+        case Format::NV21:
+        case Format::IYUV:
+        case Format::YUV444:
+        default:
+            ARM_COMPUTE_ERROR("Not supported data_type for given format");
+            return DataType::UNKNOWN;
+    }
+}
+
+/** Return the plane index of a given channel given an input format.
+ *
+ * @param[in] format  Input format
+ * @param[in] channel Input channel
+ *
+ * @return The plane index of the specific channel of the specific format
+ */
+inline int plane_idx_from_channel(Format format, Channel channel)
+{
+    switch(format)
+    {
+        case Format::NV12:
+        case Format::NV21:
+        {
+            switch(channel)
+            {
+                case Channel::Y:
+                    return 0;
+                case Channel::U:
+                case Channel::V:
+                    return 1;
+                default:
+                    ARM_COMPUTE_ERROR("Not supported channel");
+                    return 0;
+            }
+        }
+        case Format::IYUV:
+        case Format::YUV444:
+        {
+            switch(channel)
+            {
+                case Channel::Y:
+                    return 0;
+                case Channel::U:
+                    return 1;
+                case Channel::V:
+                    return 2;
+                default:
+                    ARM_COMPUTE_ERROR("Not supported channel");
+                    return 0;
+            }
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not supported format");
+            return 0;
+    }
+}
+
+/** Return the number of planes for a given format
+ *
+ * @param[in] format Input format
+ *
+ * @return The number of planes for a given image format.
+ */
+inline size_t num_planes_from_format(Format format)
+{
+    switch(format)
+    {
+        case Format::U8:
+        case Format::S16:
+        case Format::U16:
+        case Format::S32:
+        case Format::U32:
+        case Format::F16:
+        case Format::F32:
+        case Format::RGB888:
+        case Format::RGBA8888:
+        case Format::YUYV422:
+        case Format::UYVY422:
+            return 1;
+        case Format::NV12:
+        case Format::NV21:
+            return 2;
+        case Format::IYUV:
+        case Format::YUV444:
+            return 3;
+        default:
+            ARM_COMPUTE_ERROR("Not supported format");
+            return 0;
+    }
+}
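+
+/* Illustrative sketch: NV12 stores Y in plane 0 and the interleaved U/V pair in
+ * plane 1, so the helpers above give:
+ *
+ *   num_planes_from_format(Format::NV12);               // 2
+ *   plane_idx_from_channel(Format::NV12, Channel::U);   // 1
+ *   plane_idx_from_channel(Format::YUV444, Channel::V); // 2
+ */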
+
+/** Return the number of channels for a given single-planar pixel format
+ *
+ * @param[in] format Input format
+ *
+ * @return The number of channels for a given image format.
+ */
+inline size_t num_channels_from_format(Format format)
+{
+    switch(format)
+    {
+        case Format::U8:
+        case Format::U16:
+        case Format::S16:
+        case Format::U32:
+        case Format::S32:
+        case Format::F16:
+        case Format::F32:
+            return 1;
+        // Because the U and V channels are subsampled
+        // these formats appear like having only 2 channels:
+        case Format::YUYV422:
+        case Format::UYVY422:
+            return 2;
+        case Format::UV88:
+            return 2;
+        case Format::RGB888:
+            return 3;
+        case Format::RGBA8888:
+            return 4;
+        //Doesn't make sense for planar formats:
+        case Format::NV12:
+        case Format::NV21:
+        case Format::IYUV:
+        case Format::YUV444:
+        default:
+            return 0;
+    }
+}
+
+/** Separate a 2D convolution into two 1D convolutions
+ *
+ * @param[in]  conv     2D convolution
+ * @param[out] conv_col 1D vertical convolution
+ * @param[out] conv_row 1D horizontal convolution
+ * @param[in]  size     Size of the 2D convolution
+ *
+ * @return true if the separation was successful
+ */
+inline bool separate_matrix(const int16_t *conv, int16_t *conv_col, int16_t *conv_row, uint8_t size)
+{
+    int32_t min_col     = -1;
+    int16_t min_col_val = -1;
+
+    for(int32_t i = 0; i < size; ++i)
+    {
+        if(conv[i] != 0 && (min_col < 0 || abs(min_col_val) > abs(conv[i])))
+        {
+            min_col     = i;
+            min_col_val = conv[i];
+        }
+    }
+
+    if(min_col < 0)
+    {
+        return false;
+    }
+
+    for(uint32_t j = 0; j < size; ++j)
+    {
+        conv_col[j] = conv[min_col + j * size];
+    }
+
+    for(uint32_t i = 0; i < size; ++i)
+    {
+        if(static_cast<int>(i) == min_col)
+        {
+            conv_row[i] = 1;
+        }
+        else
+        {
+            const int16_t coeff = conv[i] / conv[min_col];
+
+            for(uint32_t j = 1; j < size; ++j)
+            {
+                if(conv[i + j * size] != (conv_col[j] * coeff))
+                {
+                    return false;
+                }
+            }
+
+            conv_row[i] = coeff;
+        }
+    }
+
+    return true;
+}
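+
+/* Worked example (illustrative only): the 3x3 Sobel Gx matrix is separable into
+ * conv_col = { -1, -2, -1 } and conv_row = { 1, 0, -1 }, whose outer product
+ * reconstructs the original matrix:
+ *
+ *   int16_t conv[9] = { -1, 0, 1, -2, 0, 2, -1, 0, 1 };
+ *   int16_t col[3], row[3];
+ *   bool separable = separate_matrix(conv, col, row, 3); // true
+ */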
+
+/** Calculate the scale of the given square matrix
+ *
+ * The scale is the absolute value of the sum of all the coefficients in the matrix.
+ *
+ * @note If the coefficients add up to 0 then the scale is set to 1.
+ *
+ * @param[in] matrix      Matrix coefficients
+ * @param[in] matrix_size Number of elements per side of the square matrix. (Number of coefficients = matrix_size * matrix_size).
+ *
+ * @return The absolute value of the sum of the coefficients if they don't add up to 0, otherwise 1.
+ */
+inline uint32_t calculate_matrix_scale(const int16_t *matrix, unsigned int matrix_size)
+{
+    const size_t size = matrix_size * matrix_size;
+
+    return std::max(1, std::abs(std::accumulate(matrix, matrix + size, 0)));
+}
+
+/** Calculate the accuracy required by the horizontal and vertical convolution computations
+ *
+ * @param[in] conv_col Pointer to the vertical vector of the separated convolution filter
+ * @param[in] conv_row Pointer to the horizontal vector of the convolution filter
+ * @param[in] size     Number of elements per vector of the separated matrix
+ *
+ * @return The return type is a pair. The first element of the pair is the biggest data type needed for the first stage. The second
+ *         element of the pair is the biggest data type needed for the second stage.
+ */
+inline std::pair<DataType, DataType> data_type_for_convolution(const int16_t *conv_col, const int16_t *conv_row, size_t size)
+{
+    DataType first_stage  = DataType::UNKNOWN;
+    DataType second_stage = DataType::UNKNOWN;
+
+    auto gez = [](const int16_t &v)
+    {
+        return v >= 0;
+    };
+
+    auto accu_neg = [](const int &first, const int &second)
+    {
+        return first + (second < 0 ? second : 0);
+    };
+
+    auto accu_pos = [](const int &first, const int &second)
+    {
+        return first + (second > 0 ? second : 0);
+    };
+
+    const bool only_positive_coefficients = std::all_of(conv_row, conv_row + size, gez) && std::all_of(conv_col, conv_col + size, gez);
+
+    if(only_positive_coefficients)
+    {
+        const int max_row_value = std::accumulate(conv_row, conv_row + size, 0) * UINT8_MAX;
+        const int max_value     = std::accumulate(conv_col, conv_col + size, 0) * max_row_value;
+
+        first_stage = (max_row_value <= UINT16_MAX) ? DataType::U16 : DataType::S32;
+
+        second_stage = (max_value <= UINT16_MAX) ? DataType::U16 : DataType::S32;
+    }
+    else
+    {
+        const int min_row_value  = std::accumulate(conv_row, conv_row + size, 0, accu_neg) * UINT8_MAX;
+        const int max_row_value  = std::accumulate(conv_row, conv_row + size, 0, accu_pos) * UINT8_MAX;
+        const int neg_coeffs_sum = std::accumulate(conv_col, conv_col + size, 0, accu_neg);
+        const int pos_coeffs_sum = std::accumulate(conv_col, conv_col + size, 0, accu_pos);
+        const int min_value      = neg_coeffs_sum * max_row_value + pos_coeffs_sum * min_row_value;
+        const int max_value      = neg_coeffs_sum * min_row_value + pos_coeffs_sum * max_row_value;
+
+        first_stage = ((INT16_MIN <= min_row_value) && (max_row_value <= INT16_MAX)) ? DataType::S16 : DataType::S32;
+
+        second_stage = ((INT16_MIN <= min_value) && (max_value <= INT16_MAX)) ? DataType::S16 : DataType::S32;
+    }
+
+    return std::make_pair(first_stage, second_stage);
+}
+
+/** Calculate the accuracy required by the square convolution calculation.
+ *
+ * @param[in] conv Pointer to the square convolution matrix
+ * @param[in] size The total size of the convolution matrix
+ *
+ * @return The biggest data type needed to do the convolution
+ */
+inline DataType data_type_for_convolution_matrix(const int16_t *conv, size_t size)
+{
+    auto gez = [](const int16_t v)
+    {
+        return v >= 0;
+    };
+
+    const bool only_positive_coefficients = std::all_of(conv, conv + size, gez);
+
+    if(only_positive_coefficients)
+    {
+        const int max_conv_value = std::accumulate(conv, conv + size, 0) * UINT8_MAX;
+        if(max_conv_value <= UINT16_MAX)
+        {
+            return DataType::U16;
+        }
+        else
+        {
+            return DataType::S32;
+        }
+    }
+    else
+    {
+        const int min_value = std::accumulate(conv, conv + size, 0, [](int a, int b)
+        {
+            return b < 0 ? a + b : a;
+        })
+        * UINT8_MAX;
+
+        const int max_value = std::accumulate(conv, conv + size, 0, [](int a, int b)
+        {
+            return b > 0 ? a + b : a;
+        })
+        * UINT8_MAX;
+
+        if((INT16_MIN <= min_value) && (INT16_MAX >= max_value))
+        {
+            return DataType::S16;
+        }
+        else
+        {
+            return DataType::S32;
+        }
+    }
+}
+
+/** Returns expected width and height of output scaled tensor depending on dimensions rounding mode.
+ *
+ * @param[in] width       Width of input tensor (Number of columns)
+ * @param[in] height      Height of input tensor (Number of rows)
+ * @param[in] kernel_size Kernel size.
+ * @param[in] stride_x    Stride of the operation in the x dimension.
+ * @param[in] stride_y    Stride of the operation in the y dimension.
+ * @param[in] pad_x       Padding size in the x dimension.
+ * @param[in] pad_y       Padding size in the y dimension.
+ * @param[in] round_type  Dimensions rounding mode.
+ *
+ * @return A pair with the new width in the first position and the new height in the second.
+ */
+const std::pair<unsigned int, unsigned int> scaled_dimensions(unsigned int width, unsigned int height, unsigned int kernel_size,
+                                                              unsigned int stride_x, unsigned int stride_y,
+                                                              unsigned int pad_x, unsigned int pad_y,
+                                                              DimensionRoundingType round_type);
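+
+/* Illustrative sketch, assuming the usual CNN down-scaling formula
+ * out = (in + 2 * pad - kernel) / stride + 1 with the requested rounding:
+ * a 224x224 input, 7x7 kernel, stride 2 and padding 3 would give
+ *
+ *   auto out = scaled_dimensions(224, 224, 7, 2, 2, 3, 3, DimensionRoundingType::FLOOR);
+ *   // out.first == 112, out.second == 112
+ */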
+
+/** Convert a tensor format into a string.
+ *
+ * @param[in] format @ref Format to be translated to string.
+ *
+ * @return The string describing the format.
+ */
+const std::string &string_from_format(Format format);
+
+/** Convert a channel identity into a string.
+ *
+ * @param[in] channel @ref Channel to be translated to string.
+ *
+ * @return The string describing the channel.
+ */
+const std::string &string_from_channel(Channel channel);
+
+/** Convert a data type identity into a string.
+ *
+ * @param[in] dt @ref DataType to be translated to string.
+ *
+ * @return The string describing the data type.
+ */
+const std::string &string_from_data_type(DataType dt);
+
+/** Convert a matrix pattern into a string.
+ *
+ * @param[in] pattern @ref MatrixPattern to be translated to string.
+ *
+ * @return The string describing the matrix pattern.
+ */
+const std::string &string_from_matrix_pattern(MatrixPattern pattern);
+
+/** Translate a given activation function to a string.
+ *
+ * @param[in] act @ref ActivationLayerInfo::ActivationFunction to be translated to string.
+ *
+ * @return The string describing the activation function.
+ */
+const std::string &string_from_activation_func(ActivationLayerInfo::ActivationFunction act);
+
+/** Translate a given non linear function to a string.
+ *
+ * @param[in] function @ref NonLinearFilterFunction to be translated to string.
+ *
+ * @return The string describing the non linear function.
+ */
+const std::string &string_from_non_linear_filter_function(NonLinearFilterFunction function);
+
+/** Translate a given interpolation policy to a string.
+ *
+ * @param[in] policy @ref InterpolationPolicy to be translated to string.
+ *
+ * @return The string describing the interpolation policy.
+ */
+const std::string &string_from_interpolation_policy(InterpolationPolicy policy);
+
+/** Translate a given border mode policy to a string.
+ *
+ * @param[in] border_mode @ref BorderMode to be translated to string.
+ *
+ * @return The string describing the border mode.
+ */
+const std::string &string_from_border_mode(BorderMode border_mode);
+
+/** Translate a given normalization type to a string.
+ *
+ * @param[in] type @ref NormType to be translated to string.
+ *
+ * @return The string describing the normalization type.
+ */
+const std::string &string_from_norm_type(NormType type);
+
+/** Lower a given string.
+ *
+ * @param[in] val Given string to lower.
+ *
+ * @return The lowered string
+ */
+std::string lower_string(const std::string &val);
+
+/** Check if a given data type is of floating point type
+ *
+ * @param[in] dt Input data type.
+ *
+ * @return True if data type is of floating point type, else false.
+ */
+inline bool is_data_type_float(DataType dt)
+{
+    switch(dt)
+    {
+        case DataType::F16:
+        case DataType::F32:
+            return true;
+        default:
+            return false;
+    }
+}
+
+/** Check if a given data type is of fixed point type
+ *
+ * @param[in] dt Input data type.
+ *
+ * @return True if data type is of fixed point type, else false.
+ */
+inline bool is_data_type_fixed_point(DataType dt)
+{
+    switch(dt)
+    {
+        case DataType::QS8:
+        case DataType::QS16:
+            return true;
+        default:
+            return false;
+    }
+}
+
+/** Print consecutive elements to an output stream.
+ *
+ * @param[out] s             Output stream to print the elements to.
+ * @param[in]  ptr           Pointer to print the elements from.
+ * @param[in]  n             Number of elements to print.
+ * @param[in]  stream_width  (Optional) Width of the stream. If set to 0 the element's width is used. Defaults to 0.
+ * @param[in]  element_delim (Optional) Delimiter between consecutive elements. Defaults to a single space.
+ */
+template <typename T>
+void print_consecutive_elements_impl(std::ostream &s, const T *ptr, unsigned int n, int stream_width = 0, const std::string &element_delim = " ")
+{
+    using print_type = typename std::conditional<std::is_floating_point<T>::value, T, int>::type;
+
+    for(unsigned int i = 0; i < n; ++i)
+    {
+        // Set stream width as it is not a "sticky" stream manipulator
+        if(stream_width != 0)
+        {
+            s.width(stream_width);
+        }
+        s << std::right << static_cast<print_type>(ptr[i]) << element_delim;
+    }
+}
+
+/** Identify the maximum width of n consecutive elements.
+ *
+ * @param[in] s   The output stream which will be used to print the elements. Used to extract the stream format.
+ * @param[in] ptr Pointer to the elements.
+ * @param[in] n   Number of elements.
+ *
+ * @return The maximum width of the elements.
+ */
+template <typename T>
+int max_consecutive_elements_display_width_impl(std::ostream &s, const T *ptr, unsigned int n)
+{
+    using print_type = typename std::conditional<std::is_floating_point<T>::value, T, int>::type;
+
+    int max_width = -1;
+    for(unsigned int i = 0; i < n; ++i)
+    {
+        std::stringstream ss;
+        ss.copyfmt(s);
+        ss << static_cast<print_type>(ptr[i]);
+        max_width = std::max<int>(max_width, ss.str().size());
+    }
+    return max_width;
+}
+
+/** Print consecutive elements to an output stream.
+ *
+ * @param[out] s             Output stream to print the elements to.
+ * @param[in]  dt            Data type of the elements
+ * @param[in]  ptr           Pointer to print the elements from.
+ * @param[in]  n             Number of elements to print.
+ * @param[in]  stream_width  (Optional) Width of the stream. If set to 0 the element's width is used. Defaults to 0.
+ * @param[in]  element_delim (Optional) Delimiter between consecutive elements. Defaults to a single space.
+ */
+void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n, int stream_width, const std::string &element_delim = " ");
+
+/** Identify the maximum width of n consecutive elements.
+ *
+ * @param[in] s   Output stream used to extract the stream format.
+ * @param[in] dt  Data type of the elements
+ * @param[in] ptr Pointer to the elements.
+ * @param[in] n   Number of elements.
+ *
+ * @return The maximum width of the elements.
+ */
+int max_consecutive_elements_display_width(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n);
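+
+/* Usage sketch (illustrative only): print four U8 elements separated by commas.
+ *
+ *   const uint8_t data[4] = { 1, 2, 3, 4 };
+ *   print_consecutive_elements(std::cout, DataType::U8, data, 4, 0, ", ");
+ */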
+}
+#endif /*__ARM_COMPUTE_UTILS_H__ */
diff --git a/arm_compute/core/Validate.h b/arm_compute/core/Validate.h
new file mode 100644
index 0000000000..48eba70adf
--- /dev/null
+++ b/arm_compute/core/Validate.h
@@ -0,0 +1,563 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_VALIDATE_H__
+#define __ARM_COMPUTE_VALIDATE_H__
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/HOGInfo.h"
+#include "arm_compute/core/IKernel.h"
+#include "arm_compute/core/IMultiHOG.h"
+#include "arm_compute/core/IMultiImage.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/MultiImageInfo.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+
+namespace arm_compute
+{
+namespace detail
+{
+/** Check whether two dimension objects differ.
+ *
+ * @param[in] dim1      First object to be compared.
+ * @param[in] dim2      Second object to be compared.
+ * @param[in] upper_dim The dimension from which to check.
+ *
+ * @return Return true if the two objects are different.
+ */
+template <typename T>
+inline bool have_different_dimensions(const Dimensions<T> &dim1, const Dimensions<T> &dim2, unsigned int upper_dim)
+{
+    for(unsigned int i = upper_dim; i < arm_compute::Dimensions<T>::num_max_dimensions; ++i)
+    {
+        if(dim1[i] != dim2[i])
+        {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+/** Functor to compare two @ref Dimensions objects and throw an error on mismatch.
+ *
+ * @param[in] dim      Object to compare against.
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file     File in which the error occurred.
+ * @param[in] line     Line in which the error occurred.
+ */
+template <typename T>
+class compare_dimension
+{
+public:
+    compare_dimension(const Dimensions<T> &dim, const char *function, const char *file, int line)
+        : _dim{ dim }, _function{ function }, _file{ file }, _line{ line }
+    {
+    }
+
+    /** Compare the given object against the stored one.
+     *
+     * @param[in] dim To be compared object.
+     */
+    void operator()(const Dimensions<T> &dim)
+    {
+        ARM_COMPUTE_ERROR_ON_LOC_MSG(have_different_dimensions(_dim, dim, 0), _function, _file, _line,
+                                     "Objects have different dimensions");
+    }
+
+private:
+    const Dimensions<T> &_dim;
+    const char *const    _function;
+    const char *const    _file;
+    const int            _line;
+};
+} // namespace detail
+
+/** Throw an error if one of the pointers is a nullptr.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file     Name of the file where the error occurred.
+ * @param[in] line     Line on which the error occurred.
+ * @param[in] pointers Pointers to check against nullptr.
+ */
+template <typename... Ts>
+void error_on_nullptr(const char *function, const char *file, const int line, Ts &&... pointers)
+{
+    auto is_nullptr = [&](const void *ptr)
+    {
+        ARM_COMPUTE_ERROR_ON_LOC(ptr == nullptr, function, file, line);
+    };
+
+    for_each(is_nullptr, std::forward<Ts>(pointers)...);
+}
+#define ARM_COMPUTE_ERROR_ON_NULLPTR(...) ::arm_compute::error_on_nullptr(__func__, __FILE__, __LINE__, __VA_ARGS__)
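+
+/* Usage sketch (illustrative only): a kernel's configure() can reject null
+ * operands up front with the macro defined above; input/output are hypothetical.
+ *
+ *   void configure(const ITensor *input, ITensor *output)
+ *   {
+ *       ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ *       // ...
+ *   }
+ */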
+
+/** Throw an error if the passed window is invalid.
+ *
+ * The subwindow is invalid if:
+ * - It is not a valid window.
+ * - Its dimensions don't match the full window's ones
+ * - The step for each of its dimensions is not identical to the corresponding one of the full window.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file     Name of the file where the error occurred.
+ * @param[in] line     Line on which the error occurred.
+ * @param[in] full     Full size window
+ * @param[in] win      Window to validate.
+ */
+void error_on_mismatching_windows(const char *function, const char *file, const int line,
+                                  const Window &full, const Window &win);
+#define ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(f, w) ::arm_compute::error_on_mismatching_windows(__func__, __FILE__, __LINE__, f, w)
+
+/** Throw an error if the passed subwindow is invalid.
+ *
+ * The subwindow is invalid if:
+ * - It is not a valid window.
+ * - It is not fully contained inside the full window
+ * - The step for each of its dimensions is not identical to the corresponding one of the full window.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file     Name of the file where the error occurred.
+ * @param[in] line     Line on which the error occurred.
+ * @param[in] full     Full size window
+ * @param[in] sub      Sub-window to validate.
+ */
+void error_on_invalid_subwindow(const char *function, const char *file, const int line,
+                                const Window &full, const Window &sub);
+#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s) ::arm_compute::error_on_invalid_subwindow(__func__, __FILE__, __LINE__, f, s)
+
+/** Throw an error if the passed coordinates have too many dimensions.
+ *
+ * The coordinates have too many dimensions if any of the dimensions greater than or equal to max_dim is different from 0.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file     Name of the file where the error occurred.
+ * @param[in] line     Line on which the error occurred.
+ * @param[in] pos      Coordinates to validate
+ * @param[in] max_dim  Maximum number of dimensions allowed.
+ */
+void error_on_coordinates_dimensions_gte(const char *function, const char *file, const int line,
+                                         const Coordinates &pos, unsigned int max_dim);
+#define ARM_COMPUTE_ERROR_ON_COORDINATES_DIMENSIONS_GTE(p, md) ::arm_compute::error_on_coordinates_dimensions_gte(__func__, __FILE__, __LINE__, p, md)
+
+/** Throw an error if the passed window has too many dimensions.
+ *
+ * The window has too many dimensions if any of the dimensions greater than or equal to max_dim is different from 0.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file     Name of the file where the error occurred.
+ * @param[in] line     Line on which the error occurred.
+ * @param[in] win      Window to validate
+ * @param[in] max_dim  Maximum number of dimensions allowed.
+ */
+void error_on_window_dimensions_gte(const char *function, const char *file, const int line,
+                                    const Window &win, unsigned int max_dim);
+#define ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(w, md) ::arm_compute::error_on_window_dimensions_gte(__func__, __FILE__, __LINE__, w, md)
+
+/** Throw an error if the passed dimension objects differ.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file     Name of the file where the error occurred.
+ * @param[in] line     Line on which the error occurred.
+ * @param[in] dim1     The first object to be compared.
+ * @param[in] dim2     The second object to be compared.
+ * @param[in] dims     (Optional) Further allowed objects.
+ */
+template <typename T, typename... Ts>
+void error_on_mismatching_dimensions(const char *function, const char *file, int line,
+                                     const Dimensions<T> &dim1, const Dimensions<T> &dim2, Ts &&... dims)
+{
+    ARM_COMPUTE_UNUSED(function);
+    ARM_COMPUTE_UNUSED(file);
+    ARM_COMPUTE_UNUSED(line);
+
+    for_each(detail::compare_dimension<T>(dim1, function, file, line), dim2, std::forward<Ts>(dims)...);
+}
+#define ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(...) ::arm_compute::error_on_mismatching_dimensions(__func__, __FILE__, __LINE__, __VA_ARGS__)
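+
+/* Usage sketch (illustrative only): validating that two operands agree before
+ * configuring a kernel; full_window, win, input and output are hypothetical,
+ * and tensor_shape() is assumed to return a Dimensions-derived object.
+ *
+ *   ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(full_window, win);
+ *   ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input->info()->tensor_shape(),
+ *                                               output->info()->tensor_shape());
+ */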
+
+/** Throw an error if the passed two tensors have different shapes from the given dimension
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file     Name of the file where the error occurred.
+ * @param[in] line     Line on which the error occurred.
+ * @param[in] tensor_1 The first tensor to be compared.
+ * @param[in] tensor_2 The second tensor to be compared.
+ * @param[in] tensors  (Optional) Further allowed tensors.
+ */
+template <typename... Ts>
+void error_on_mismatching_shapes(const char *function, const char *file, const int line,
+                                 const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+{
+    error_on_mismatching_shapes(function, file, line, 0U, tensor_1, tensor_2, std::forward<Ts>(tensors)...);
+}
+
+/** Throw an error if the passed two tensors have different shapes from the given dimension
+ *
+ * @param[in] function  Function in which the error occurred.
+ * @param[in] file      Name of the file where the error occurred.
+ * @param[in] line      Line on which the error occurred.
+ * @param[in] upper_dim The dimension from which to check.
+ * @param[in] tensor_1  The first tensor to be compared.
+ * @param[in] tensor_2  The second tensor to be compared.
+ * @param[in] tensors   (Optional) Further allowed tensors.
+ */
+template <typename... Ts>
+void error_on_mismatching_shapes(const char *function, const char *file, const int line,
+                                 unsigned int upper_dim, const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+{
+    ARM_COMPUTE_UNUSED(function);
+    ARM_COMPUTE_UNUSED(file);
+    ARM_COMPUTE_UNUSED(line);
+
+    const std::array < const ITensor *, 2 + sizeof...(Ts) > tensors_array{ { tensor_1, tensor_2, std::forward<Ts>(tensors)... } };
+    ARM_COMPUTE_UNUSED(tensors_array);
+
+    ARM_COMPUTE_ERROR_ON_LOC(*tensors_array.cbegin() == nullptr, function, file, line);
+
+    ARM_COMPUTE_ERROR_ON_LOC_MSG(std::any_of(std::next(tensors_array.cbegin()), tensors_array.cend(), [&](const ITensor * tensor)
+    {
+        ARM_COMPUTE_ERROR_ON_LOC(tensor == nullptr, function, file, line);
+        return detail::have_different_dimensions((*tensors_array.cbegin())->info()->tensor_shape(), tensor->info()->tensor_shape(), upper_dim);
+    }),
+    function, file, line, "Tensors have different shapes");
+}
+#define ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(...) ::arm_compute::error_on_mismatching_shapes(__func__, __FILE__, __LINE__, __VA_ARGS__)
+
+/** Throw an error if the passed two tensors have different data types
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file     Name of the file where the error occurred.
+ * @param[in] line     Line on which the error occurred.
+ * @param[in] tensor_1 The first tensor to be compared.
+ * @param[in] tensor_2 The second tensor to be compared.
+ * @param[in] tensors  (Optional) Further allowed tensors.
+ */
+template <typename... Ts>
+void error_on_mismatching_data_types(const char *function, const char *file, const int line,
+                                     const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+{
+    ARM_COMPUTE_UNUSED(function);
+    ARM_COMPUTE_UNUSED(file);
+    ARM_COMPUTE_UNUSED(line);
+    ARM_COMPUTE_UNUSED(tensor_1);
+    ARM_COMPUTE_UNUSED(tensor_2);
+
+    DataType &&first_data_type = tensor_1->info()->data_type();
+    ARM_COMPUTE_UNUSED(first_data_type);
+
+    const std::array<const ITensor *, sizeof...(Ts)> tensors_array{ { std::forward<Ts>(tensors)... } };
+    ARM_COMPUTE_UNUSED(tensors_array);
+
+    ARM_COMPUTE_ERROR_ON_LOC_MSG(tensor_2->info()->data_type() != first_data_type || std::any_of(tensors_array.begin(), tensors_array.end(), [&](const ITensor * tensor)
+    {
+        return tensor->info()->data_type() != first_data_type;
+    }),
+    function, file, line, "Tensors have different data types");
+}
+
+#define ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(...) ::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__)
+
+/** Throw an error if the passed tensors have different fixed point data types or different fixed point positions
+ *
+ * @note If the first tensor doesn't have fixed point data type, the function returns without throwing an error
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file     Name of the file where the error occurred.
+ * @param[in] line     Line on which the error occurred.
+ * @param[in] tensor_1 The first tensor to be compared.
+ * @param[in] tensor_2 The second tensor to be compared.
+ * @param[in] tensors  (Optional) Further allowed tensors.
+ */
+template <typename... Ts>
+void error_on_mismatching_fixed_point(const char *function, const char *file, const int line,
+                                      const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+{
+    ARM_COMPUTE_UNUSED(function);
+    ARM_COMPUTE_UNUSED(file);
+    ARM_COMPUTE_UNUSED(line);
+    ARM_COMPUTE_UNUSED(tensor_1);
+    ARM_COMPUTE_UNUSED(tensor_2);
+
+    DataType &&first_data_type            = tensor_1->info()->data_type();
+    const int  first_fixed_point_position = tensor_1->info()->fixed_point_position();
+    ARM_COMPUTE_UNUSED(first_data_type);
+    ARM_COMPUTE_UNUSED(first_fixed_point_position);
+
+    if((first_data_type != DataType::QS8) && (first_data_type != DataType::QS16))
+    {
+        return;
+    }
+
+    const std::array < const ITensor *, 1 + sizeof...(Ts) > tensors_array{ { tensor_2, std::forward<Ts>(tensors)... } };
+    ARM_COMPUTE_UNUSED(tensors_array);
+
+    ARM_COMPUTE_ERROR_ON_LOC_MSG(std::any_of(tensors_array.begin(), tensors_array.end(), [&](const ITensor * tensor)
+    {
+        return tensor->info()->data_type() != first_data_type;
+    }),
+    function, file, line, "Tensors have different fixed point data types");
+
+    ARM_COMPUTE_ERROR_ON_LOC_MSG(std::any_of(tensors_array.begin(), tensors_array.end(), [&](const ITensor * tensor)
+    {
+        return tensor->info()->fixed_point_position() != first_fixed_point_position;
+    }),
+    function, file, line, "Tensors have different fixed point positions");
+}
+
+#define ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(...) ::arm_compute::error_on_mismatching_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__)
+
+/** Throw an error if the format of the passed tensor/multi-image does not match any of the formats provided.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file     Name of the file where the error occurred.
+ * @param[in] line     Line on which the error occurred.
+ * @param[in] object   Tensor/multi-image to validate.
+ * @param[in] format   First format allowed.
+ * @param[in] formats  (Optional) Further allowed formats.
+ */
+template <typename T, typename F, typename... Fs>
+void error_on_format_not_in(const char *function, const char *file, const int line,
+                            const T *object, F &&format, Fs &&... formats)
+{
+    ARM_COMPUTE_ERROR_ON_LOC(object == nullptr, function, file, line);
+
+    Format &&object_format = object->info()->format();
+    ARM_COMPUTE_UNUSED(object_format);
+
+    ARM_COMPUTE_ERROR_ON_LOC(object_format == Format::UNKNOWN, function, file, line);
+
+    const std::array<F, sizeof...(Fs)> formats_array{ { std::forward<Fs>(formats)... } };
+    ARM_COMPUTE_UNUSED(formats_array);
+
+    ARM_COMPUTE_ERROR_ON_LOC_MSG(object_format != format && std::none_of(formats_array.begin(), formats_array.end(), [&](const F & f)
+    {
+        return f == object_format;
+    }),
+    function, file, line, "Format %s not supported by this kernel", string_from_format(object_format).c_str());
+}
+#define ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(t, ...) ::arm_compute::error_on_format_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__)
+
+/** Throw an error if the data type of the passed tensor does not match any of the data types provided.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file     Name of the file where the error occurred.
+ * @param[in] line     Line on which the error occurred.
+ * @param[in] tensor   Tensor to validate.
+ * @param[in] dt       First data type allowed.
+ * @param[in] dts      (Optional) Further allowed data types.
+ */
+template <typename T, typename... Ts>
+void error_on_data_type_not_in(const char *function, const char *file, const int line,
+                               const ITensor *tensor, T &&dt, Ts &&... dts)
+{
+    ARM_COMPUTE_ERROR_ON_LOC(tensor == nullptr, function, file, line);
+
+    const DataType &tensor_dt = tensor->info()->data_type(); //NOLINT
+    ARM_COMPUTE_UNUSED(tensor_dt);
+
+    ARM_COMPUTE_ERROR_ON_LOC(tensor_dt == DataType::UNKNOWN, function, file, line);
+
+    const std::array<T, sizeof...(Ts)> dts_array{ { std::forward<Ts>(dts)... } };
+    ARM_COMPUTE_UNUSED(dts_array);
+
+    ARM_COMPUTE_ERROR_ON_LOC_MSG(tensor_dt != dt && std::none_of(dts_array.begin(), dts_array.end(), [&](const T & d)
+    {
+        return d == tensor_dt;
+    }),
+    function, file, line, "ITensor data type %s not supported by this kernel", string_from_data_type(tensor_dt).c_str());
+}
+#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(t, ...) ::arm_compute::error_on_data_type_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__)
+
+/** Throw an error if the data type or the number of channels of the passed tensor does not match any of the data types and number of channels provided.
+ *
+ * @param[in] function     Function in which the error occurred.
+ * @param[in] file         Name of the file where the error occurred.
+ * @param[in] line         Line on which the error occurred.
+ * @param[in] tensor       Tensor to validate.
+ * @param[in] num_channels Number of channels to check
+ * @param[in] dt           First data type allowed.
+ * @param[in] dts          (Optional) Further allowed data types.
+ */
+template <typename T, typename... Ts>
+void error_on_data_type_channel_not_in(const char *function, const char *file, const int line,
+                                       const ITensor *tensor, size_t num_channels, T &&dt, Ts &&... dts)
+{
+    error_on_data_type_not_in(function, file, line, tensor, std::forward<T>(dt), std::forward<Ts>(dts)...);
+
+    const size_t tensor_nc = tensor->info()->num_channels();
+    ARM_COMPUTE_UNUSED(tensor_nc);
+
+    ARM_COMPUTE_ERROR_ON_LOC_MSG(tensor_nc != num_channels, function, file, line, "Number of channels %d. Required number of channels %d", tensor_nc, num_channels);
+}
+#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c, ...) ::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, t, c, __VA_ARGS__)
+
+/** Throw an error if the tensor is not 2D.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file     Name of the file where the error occurred.
+ * @param[in] line     Line on which the error occurred.
+ * @param[in] tensor   Tensor to validate.
+ */
+void error_on_tensor_not_2d(const char *function, const char *file, const int line,
+                            const ITensor *tensor);
+#define ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(t) ::arm_compute::error_on_tensor_not_2d(__func__, __FILE__, __LINE__, t)
+
+/** Throw an error if the channel is not in channels.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file     Name of the file where the error occurred.
+ * @param[in] line     Line on which the error occurred.
+ * @param[in] cn       Input channel
+ * @param[in] channel  First channel allowed.
+ * @param[in] channels (Optional) Further allowed channels.
+ */
+template <typename T, typename... Ts>
+void error_on_channel_not_in(const char *function, const char *file, const int line,
+                             T cn, T &&channel, Ts &&... channels)
+{
+    ARM_COMPUTE_ERROR_ON_LOC(cn == Channel::UNKNOWN, function, file, line);
+
+    const std::array<T, sizeof...(Ts)> channels_array{ { std::forward<Ts>(channels)... } };
+    ARM_COMPUTE_UNUSED(channels_array);
+    ARM_COMPUTE_ERROR_ON_LOC(channel != cn && std::none_of(channels_array.begin(), channels_array.end(), [&](const T & f)
+    {
+        return f == cn;
+    }),
+    function, file, line);
+}
+#define ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN(c, ...) ::arm_compute::error_on_channel_not_in(__func__, __FILE__, __LINE__, c, __VA_ARGS__)
+
+/** Throw an error if the channel is not in the given format.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file     Name of the file where the error occurred.
+ * @param[in] line     Line on which the error occurred.
+ * @param[in] fmt      Input format
+ * @param[in] cn       Channel to validate.
+ */
+void error_on_channel_not_in_known_format(const char *function, const char *file, const int line,
+                                          Format fmt, Channel cn);
+#define ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(f, c) ::arm_compute::error_on_channel_not_in_known_format(__func__, __FILE__, __LINE__, f, c)
+
+/** Throw an error if the @ref IMultiHOG container is invalid
+ *
+ * An @ref IMultiHOG container is invalid if:
+ *
+ * -# it is a nullptr
+ * -# it doesn't contain models
+ * -# it doesn't have the HOG data objects with the same phase_type, normalization_type and l2_hyst_threshold (if normalization_type == L2HYS_NORM)
+ *
+ * @param[in] function  Function in which the error occurred.
+ * @param[in] file      Name of the file where the error occurred.
+ * @param[in] line      Line on which the error occurred.
+ * @param[in] multi_hog IMultiHOG container to validate
+ */
+void error_on_invalid_multi_hog(const char *function, const char *file, const int line,
+                                const IMultiHOG *multi_hog);
+#define ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(m) ::arm_compute::error_on_invalid_multi_hog(__func__, __FILE__, __LINE__, m)
+
+/** Throw an error if the kernel is not configured.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file     Name of the file where the error occurred.
+ * @param[in] line     Line on which the error occurred.
+ * @param[in] kernel   Kernel to validate.
+ */
+void error_on_unconfigured_kernel(const char *function, const char *file, const int line,
+                                  const IKernel *kernel);
+#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k) ::arm_compute::error_on_unconfigured_kernel(__func__, __FILE__, __LINE__, k)
+
+/** Throw an error if the coordinates and shape of the subtensor are not within the parent tensor.
+ *
+ * @param[in] function     Function in which the error occurred.
+ * @param[in] file         Name of the file where the error occurred.
+ * @param[in] line         Line on which the error occurred.
+ * @param[in] parent_shape Parent tensor shape
+ * @param[in] coords       Coordinates inside the parent tensor where the first element of the subtensor is
+ * @param[in] shape        Shape of the subtensor
+ */
+void error_on_invalid_subtensor(const char *function, const char *file, const int line,
+                                const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape);
+#define ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(p, c, s) ::arm_compute::error_on_invalid_subtensor(__func__, __FILE__, __LINE__, p, c, s)
+
+/** Throw an error if the valid region of a subtensor is not inside the valid region of the parent tensor.
+ *
+ * @param[in] function            Function in which the error occurred.
+ * @param[in] file                Name of the file where the error occurred.
+ * @param[in] line                Line on which the error occurred.
+ * @param[in] parent_valid_region Parent valid region.
+ * @param[in] valid_region        Valid region of subtensor.
+ */
+void error_on_invalid_subtensor_valid_region(const char *function, const char *file, const int line,
+                                             const ValidRegion &parent_valid_region, const ValidRegion &valid_region);
+#define ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(pv, sv) ::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv)
+
+/** Throw an error if the input fixed-point positions are different.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file     Name of the file where the error occurred.
+ * @param[in] line     Line on which the error occurred.
+ * @param[in] tensor_1 The first tensor to be compared.
+ * @param[in] tensor_2 The second tensor to be compared.
+ * @param[in] tensors  (Optional) Further allowed tensors.
+ */
+template <typename... Ts>
+void error_on_mismatching_fixed_point_position(const char *function, const char *file, const int line,
+                                               const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+{
+    const std::array < const ITensor *, 1 + sizeof...(Ts) > tensors_array{ { tensor_2, std::forward<Ts>(tensors)... } };
+    ARM_COMPUTE_UNUSED(tensors_array);
+
+    ARM_COMPUTE_ERROR_ON_LOC_MSG(std::any_of(tensors_array.begin(), tensors_array.end(), [&](const ITensor * tensor)
+    {
+        return tensor->info()->fixed_point_position() != tensor_1->info()->fixed_point_position();
+    }),
+    function, file, line, "Tensors have different fixed-point positions");
+}
+#define ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(...) ::arm_compute::error_on_mismatching_fixed_point_position(__func__, __FILE__, __LINE__, __VA_ARGS__)
+
+/** Throw an error if the fixed-point value is not representable in the specified Q format.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file     Name of the file where the error occurred.
+ * @param[in] line     Line on which the error occurred.
+ * @param[in] value    The floating point value to be checked.
+ * @param[in] tensor   Input tensor that has information on data type and fixed-point position.
+ */
+template <typename... Ts>
+void error_on_value_not_representable_in_fixed_point(const char *function, const char *file, int line,
+                                                     float value, const ITensor *tensor)
+{
+    const int          fixed_point_position = tensor->info()->fixed_point_position();
+    const DataType     dt                   = tensor->info()->data_type();
+    const unsigned int q_max_range          = 0xFFFFFFFFu >> (((sizeof(unsigned int) - element_size_from_data_type(dt)) * 8) + 1);
+    const float        max_range            = q_max_range / (static_cast<float>(1 << fixed_point_position));
+    ARM_COMPUTE_UNUSED(max_range);
+
+    ARM_COMPUTE_ERROR_ON_LOC_MSG(value > max_range, function, file, line,
+                                 "Value %f is not representable in %s with fixed-point position %d", value, string_from_data_type(dt).c_str(), fixed_point_position);
+}
+#define ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(...) ::arm_compute::error_on_value_not_representable_in_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__)
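+
+/* Worked example (illustrative only, assuming a 32-bit unsigned int): for a QS8
+ * tensor (1 byte per element) the magnitude bound is 0xFFFFFFFF >> 25 = 127, so
+ * with fixed_point_position == 4 the limit is max_range = 127 / 2^4 = 7.9375 and
+ * a value of 8.0f would trigger the error:
+ *
+ *   ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(8.0f, tensor); // tensor: hypothetical QS8 ITensor*
+ */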
+}
+#endif /* __ARM_COMPUTE_VALIDATE_H__ */
diff --git a/arm_compute/core/Window.h b/arm_compute/core/Window.h
new file mode 100644
index 0000000000..6e7ef22531
--- /dev/null
+++ b/arm_compute/core/Window.h
@@ -0,0 +1,355 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_WINDOW_H__
+#define __ARM_COMPUTE_WINDOW_H__
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Utils.h"
+
+namespace arm_compute
+{
+/** Describe a multidimensional execution window. */
+class Window
+{
+public:
+    /** Alias for dimension 0 also known as X dimension */
+    static constexpr size_t DimX = 0;
+    /** Alias for dimension 1 also known as Y dimension */
+    static constexpr size_t DimY = 1;
+    /** Alias for dimension 2 also known as Z dimension */
+    static constexpr size_t DimZ = 2;
+
+    /** Default constructor: create a window containing a single element. */
+    constexpr Window()
+        : _dims(), _thread_id(0), _num_threads(1)
+    {
+    }
+    /** Copy constructor
+     *
+     * @param[in] src Copy the values from src to a new object
+     */
+    Window(const Window &src);
+
+    /** Describe one of the image's dimensions with a start, end and step.
+     *
+     * Iteration through the elements of the dimension is done like this:
+     * @code
+     * for(int v = start(); v < end(); v += step())
+     * {
+     *     ...
+     * }
+     * @endcode
+     */
+    class Dimension
+    {
+    public:
+        /** Constructor, by default creates a dimension of 1.
+         *
+         * @param[in] start Start of the dimension
+         * @param[in] end   End of the dimension
+         * @param[in] step  Step between two elements of the dimension when iterating.
+         */
+        constexpr Dimension(int start = 0, int end = 1, int step = 1)
+            : _start(start), _end(end), _step(step)
+        {
+        }
+        /** Default assignment operator to allow dimensions to be copied */
+        Dimension &operator=(const Dimension &d) = default;
+        /** Return the start of the dimension */
+        constexpr int start() const
+        {
+            return _start;
+        }
+        /** Return the end of the dimension */
+        constexpr int end() const
+        {
+            return _end;
+        }
+        /** Return the step of the dimension */
+        constexpr int step() const
+        {
+            return _step;
+        }
+        /** Set the dimension's step
+         *
+         * @param[in] step The new step
+         */
+        void set_step(int step)
+        {
+            _step = step;
+        }
+
+    private:
+        int _start; /**< Start of the dimension */
+        int _end;   /**< End of the dimension */
+        int _step;  /**< Step between two elements of the dimension when iterating */
+    };
+
+    /** Read only access to a given dimension of the window
+     *
+     * @note Precondition: dimension < Coordinates::num_max_dimensions
+     *
+     * @param[in] dimension The dimension to access
+     *
+     * @return The requested dimension
+     */
+    constexpr const Dimension &operator[](size_t dimension) const;
+
+    /** Alias to access the first dimension of the window
+     *
+     * @return First dimension of the window
+     */
+    constexpr const Dimension &x() const
+    {
+        return _dims.at(Window::DimX);
+    }
+
+    /** Alias to access the second dimension of the window
+     *
+     * @return Second dimension of the window
+     */
+    constexpr const Dimension &y() const
+    {
+        return _dims.at(Window::DimY);
+    }
+
+    /** Alias to access the third dimension of the window
+     *
+     * @return Third dimension of the window
+     */
+    constexpr const Dimension &z() const
+    {
+        return _dims.at(Window::DimZ);
+    }
+
+    /** Set the values of a given dimension
+     *
+     * @param[in] dimension The dimension to set
+     * @param[in] dim       The values to set the dimension to
+     */
+    void set(size_t dimension, const Dimension &dim);
+
+    /** Use the tensor's dimensions to fill the window dimensions.
+     *
+     * @param[in] info            Tensor information to copy the dimensions from.
+     * @param[in] first_dimension Only copy dimensions which are greater or equal to this value.
+     */
+    void use_tensor_dimensions(const ITensorInfo *info, size_t first_dimension = Window::DimX);
+
+    /** Shift the values of a given dimension by the given shift_value
+     *
+     * @param[in] dimension   The dimension to shift
+     * @param[in] shift_value Value to shift the start and end values of.
+     */
+    void shift(size_t dimension, int shift_value);
+
+    /** Adjust the start or end of a given dimension by the given value
+     *
+     * @param[in] dimension    The dimension to adjust
+     * @param[in] adjust_value The adjusted value.
+     * @param[in] is_at_start  Flag indicating whether to adjust the start or the end of the dimension.
+     */
+    void adjust(size_t dimension, int adjust_value, bool is_at_start);
+
+    /** Scale the values of a given dimension by the given scale_value
+     *
+     * @note The end of the window is rounded up to be a multiple of step after the scaling.
+     *
+     * @param[in] dimension   The dimension to scale
+     * @param[in] scale_value Value to scale the start, end and step values of.
+     */
+    void scale(size_t dimension, float scale_value);
+ * + * @param[in] dimension Dimension to update + * @param[in] step The new dimension's step value + */ + void set_dimension_step(size_t dimension, int step); + + /** Will validate all the window's dimensions' values when asserts are enabled + * + * No-op when asserts are disabled + */ + void validate() const; + + /** Return the number of iterations needed to iterate through a given dimension + * + * @param[in] dimension The requested dimension + * + * @return The number of iterations + */ + constexpr size_t num_iterations(size_t dimension) const; + + /** Split a window into a set of sub windows along a given dimension + * + * For example to split a window into 3 sub-windows along the Y axis, you would have to do:
+ * Window sub0 = window.split_window(1, 0, 3);
+ * Window sub1 = window.split_window(1, 1, 3);
+ * Window sub2 = window.split_window(1, 2, 3);
+ *
+ * @param[in] dimension Dimension along which the split will be performed
+ * @param[in] id        Id of the sub-window to return. Must be in the range [0, total - 1]
+ * @param[in] total     Total number of sub-windows the window will be split into.
+ *
+ * @return The subwindow "id" out of "total"
+ */
+    Window split_window(size_t dimension, size_t id, size_t total) const;
+    /** First 1D slice of the window
+     *
+     * @return The first slice of the window.
+     */
+    Window first_slice_window_1D() const
+    {
+        return first_slice_window<1>();
+    }
+    /** First 2D slice of the window
+     *
+     * @return The first slice of the window.
+     */
+    Window first_slice_window_2D() const
+    {
+        return first_slice_window<2>();
+    }
+    /** First 3D slice of the window
+     *
+     * @return The first slice of the window.
+     */
+    Window first_slice_window_3D() const
+    {
+        return first_slice_window<3>();
+    }
+    /** Slide the passed 1D window slice.
+     *
+     * If slice contains the last slice then it will remain unchanged and false will be returned.
+     *
+     * @param[in,out] slice Current slice, to be updated to the next slice.
+     *
+     * @return true if slice contains a new slice, false if slice already contained the last slice
+     */
+    bool slide_window_slice_1D(Window &slice) const
+    {
+        return slide_window_slice<1>(slice);
+    }
+    /** Slide the passed 2D window slice.
+     *
+     * If slice contains the last slice then it will remain unchanged and false will be returned.
+     *
+     * @param[in,out] slice Current slice, to be updated to the next slice.
+     *
+     * @return true if slice contains a new slice, false if slice already contained the last slice
+     */
+    bool slide_window_slice_2D(Window &slice) const
+    {
+        return slide_window_slice<2>(slice);
+    }
+    /** Slide the passed 3D window slice.
+     *
+     * If slice contains the last slice then it will remain unchanged and false will be returned.
+     *
+     * @param[in,out] slice Current slice, to be updated to the next slice.
+     *
+     * @return true if slice contains a new slice, false if slice already contained the last slice
+     */
+    bool slide_window_slice_3D(Window &slice) const
+    {
+        return slide_window_slice<3>(slice);
+    }
+    /** Slide the passed 4D window slice.
+     *
+     * If slice contains the last slice then it will remain unchanged and false will be returned.
+     *
+     * @param[in,out] slice Current slice, to be updated to the next slice.
+     *
+     * @return true if slice contains a new slice, false if slice already contained the last slice
+     */
+    bool slide_window_slice_4D(Window &slice) const
+    {
+        return slide_window_slice<4>(slice);
+    }
+    /** Sets the ID of the thread that the window is associated with.
+     *
+     * @param id ID of the thread that the window is associated with.
+     */
+    void set_thread_id(unsigned int id)
+    {
+        _thread_id = id;
+    }
+    /** Sets the number of threads that the window is dispatched across.
+     *
+     * @param num_threads The number of threads the window is dispatched across.
+     */
+    void set_num_threads(unsigned int num_threads)
+    {
+        _num_threads = num_threads;
+    }
+    /** Get the ID of the thread that the window is associated with.
+     *
+     * @return ID of the thread that the window is associated with.
+     */
+    constexpr unsigned int thread_id() const
+    {
+        return _thread_id;
+    }
+    /** Get the number of threads that the window is dispatched across.
+     *
+     * @return The number of threads the window is dispatched across.
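The slice helpers above are meant to be used as a pair: take the first N-dimensional slice, process it, then slide until the window is exhausted. A minimal sketch of the intended loop (window extents are placeholders):

    Window win;
    win.set(Window::DimX, Window::Dimension(0, 16));
    win.set(Window::DimY, Window::Dimension(0, 8));
    win.set(Window::DimZ, Window::Dimension(0, 4));

    // Visit each z plane as an independent 2D slice (4 slices in total):
    Window slice = win.first_slice_window_2D();
    do
    {
        // x and y keep their full range; z is collapsed to a single iteration per slice
    }
    while(win.slide_window_slice_2D(slice));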
+     */
+    constexpr unsigned int num_threads() const
+    {
+        return _num_threads;
+    }
+
+private:
+    /** First slice of the window
+     *
+     * @return The first slice of the window.
+     */
+    template <unsigned int window_dimension>
+    Window first_slice_window() const;
+
+    /** Slide the passed window slice.
+     *
+     * If slice contains the last slice then it will remain unchanged and false will be returned.
+     *
+     * @param[in,out] slice Current slice, to be updated to the next slice.
+     *
+     * @return true if slice contains a new slice, false if slice already contained the last slice
+     */
+    template <unsigned int window_dimension>
+    bool slide_window_slice(Window &slice) const;
+
+private:
+    std::array<Window::Dimension, Coordinates::num_max_dimensions> _dims;
+    unsigned int _thread_id;
+    unsigned int _num_threads;
+};
+}
+#include "Window.inl"
+#endif /*__ARM_COMPUTE_WINDOW_H__ */
diff --git a/arm_compute/core/Window.inl b/arm_compute/core/Window.inl
new file mode 100644
index 0000000000..75428a145b
--- /dev/null
+++ b/arm_compute/core/Window.inl
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+namespace arm_compute
+{
+inline Window::Window(const Window &src)
+    : _dims(), _thread_id(src._thread_id), _num_threads(src._num_threads)
+{
+    for(size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
+    {
+        set(i, src[i]);
+    }
+}
+
+inline constexpr const Window::Dimension &Window::operator[](const size_t dimension) const
+{
+    // Precondition: dimension < Coordinates::num_max_dimensions
+    return _dims.at(dimension);
+}
+inline void Window::set(const size_t dimension, const Window::Dimension &dim)
+{
+    ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions);
+    _dims[dimension] = dim;
+}
+
+inline void Window::shift(const size_t dimension, const int shift_value)
+{
+    ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions);
+    Window::Dimension &d = _dims[dimension];
+    d = Window::Dimension(d.start() + shift_value, d.end() + shift_value, d.step());
+}
+
+inline void Window::adjust(size_t dimension, int adjust_value, bool is_at_start)
+{
+    ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions);
+    Window::Dimension &d = _dims[dimension];
+
+    if(is_at_start)
+    {
+        d = Window::Dimension(d.start() + adjust_value, d.end(), d.step());
+    }
+    else
+    {
+        d = Window::Dimension(d.start(), d.end() + adjust_value, d.step());
+    }
+}
+
+inline void Window::scale(const size_t dimension, float scale_value)
+{
+    ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions);
+    Window::Dimension &d = _dims[dimension];
+    const int scaled_step = d.step() * scale_value;
+    const int scaled_end = ceil_to_multiple(d.end() * scale_value, scaled_step);
+    d = Window::Dimension(d.start() * scale_value, scaled_end, scaled_step);
+}
+
+inline void Window::set_dimension_step(const size_t dimension, const int step)
+{
+    ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions);
+    _dims[dimension].set_step(step);
+}
+
+inline void Window::validate() const
+{
+    for(size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
+    {
+        ARM_COMPUTE_ERROR_ON(_dims[i].step() == 0);
+        ARM_COMPUTE_ERROR_ON(_dims[i].end() <= _dims[i].start());
+        ARM_COMPUTE_ERROR_ON((_dims[i].end() - _dims[i].start()) % _dims[i].step());
+    }
+}
+
+inline constexpr size_t Window::num_iterations(size_t dimension) const
+{
+    // Precondition: dimension < Coordinates::num_max_dimensions
+    // Precondition: (end - start) % step == 0
+    return (_dims.at(dimension).end() - _dims.at(dimension).start()) / _dims.at(dimension).step();
+}
+
+inline Window Window::split_window(const size_t dimension, const size_t id, const size_t total) const
+{
+    ARM_COMPUTE_ERROR_ON(id >= total);
+    ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions);
+
+    Window out;
+
+    for(size_t d = 0; d < Coordinates::num_max_dimensions; ++d)
+    {
+        if(d == dimension)
+        {
+            int start          = _dims[d].start();
+            int end            = _dims[d].end();
+            int per_sub_window = (num_iterations(d) / total) * _dims[d].step();
+
+            start += id * per_sub_window;
+
+            if(id != total - 1)
+            {
+                end = start + per_sub_window;
+            }
+
+            out.set(d, Dimension(start, end, _dims[d].step()));
+        }
+        else
+        {
+            out.set(d, _dims[d]);
+        }
+    }
+
+    return out;
+}
+
+template <unsigned int window_dimension>
+inline bool Window::slide_window_slice(Window &slice) const
+{
+    for(unsigned int n = window_dimension; n < Coordinates::num_max_dimensions; ++n)
+    {
+        // Did we reach the end of this dimension?
+        const int v = slice._dims[n].start() + 1;
+
+        if(v < _dims[n].end())
+        {
+            // No: increment
+            slice._dims[n] = Dimension(v, v + 1, 1);
+
+            // Reset lower dimensions:
+            for(unsigned int lower = window_dimension; lower < n; ++lower)
+            {
+                slice._dims[lower] = Dimension(_dims[lower].start(), _dims[lower].start() + 1, 1);
+            }
+            return true;
+        }
+    }
+
+    // It was the last slice
+    return false; // Iteration over
}
+
+template <unsigned int window_dimension>
+inline Window Window::first_slice_window() const
+{
+    Window slice;
+
+    std::copy_n(_dims.begin(), window_dimension, slice._dims.begin());
+
+    // Initialise higher dimensions to be the first slice.
+    for(unsigned int n = window_dimension; n < Coordinates::num_max_dimensions; ++n)
+    {
+        slice._dims[n] = Dimension(_dims[n].start(), _dims[n].start() + 1, 1);
+    }
+
+    return slice;
+}
+
+inline void Window::use_tensor_dimensions(const ITensorInfo *info, const size_t first_dimension)
+{
+    for(unsigned int n = first_dimension; n < info->num_dimensions(); ++n)
+    {
+        set(n, Window::Dimension(0, std::max(info->dimension(n), static_cast<size_t>(1))));
+    }
+}
+}
diff --git a/arm_compute/runtime/Array.h b/arm_compute/runtime/Array.h
new file mode 100644
index 0000000000..c8a240e428
--- /dev/null
+++ b/arm_compute/runtime/Array.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
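Since num_iterations() is (end - start) / step, split_window() hands out whole iterations and the last sub-window absorbs any remainder. A small worked sketch, splitting a 12-iteration dimension across 3 workers:

    Window win;
    win.set(Window::DimY, Window::Dimension(0, 12, 1)); // num_iterations(DimY) == 12

    for(size_t id = 0; id < 3; ++id)
    {
        Window sub = win.split_window(Window::DimY, id, 3);
        // sub covers y in [4 * id, 4 * id + 4): 12 / 3 = 4 iterations per worker
    }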
+ */
+#ifndef __ARM_COMPUTE_ARRAY_H__
+#define __ARM_COMPUTE_ARRAY_H__
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/Types.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Basic implementation of the IArray interface which allocates a static number of T values */
+template <class T>
+class Array : public IArray<T>
+{
+public:
+    /** Default constructor: empty array */
+    Array()
+        : IArray<T>(0), _values(nullptr)
+    {
+    }
+    /** Constructor: initializes an array which can contain up to max_num_values values
+     *
+     * @param[in] max_num_values Maximum number of values the array will be able to store
+     */
+    Array(size_t max_num_values)
+        : IArray<T>(max_num_values), _values(arm_compute::cpp14::make_unique<T[]>(max_num_values))
+    {
+    }
+
+    // Inherited methods overridden:
+    T *buffer() const override
+    {
+        return _values.get();
+    }
+
+private:
+    std::unique_ptr<T[]> _values;
+};
+
+using KeyPointArray        = Array<KeyPoint>;
+using Coordinates2DArray   = Array<Coordinates2D>;
+using DetectionWindowArray = Array<DetectionWindow>;
+using Size2DArray          = Array<Size2D>;
+using UInt8Array           = Array<uint8_t>;
+using UInt16Array          = Array<uint16_t>;
+using UInt32Array          = Array<uint32_t>;
+using Int16Array           = Array<int16_t>;
+using Int32Array           = Array<int32_t>;
+using FloatArray           = Array<float>;
+}
+#endif /* __ARM_COMPUTE_ARRAY_H__ */
diff --git a/arm_compute/runtime/CL/CLArray.h b/arm_compute/runtime/CL/CLArray.h
new file mode 100644
index 0000000000..f4c2ef06d9
--- /dev/null
+++ b/arm_compute/runtime/CL/CLArray.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
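A minimal usage sketch for this container; push_back(), num_values() and at() are assumed from the IArray base class, which is not part of this hunk:

    #include "arm_compute/runtime/Array.h"

    using namespace arm_compute;

    KeyPointArray points(100); // room for up to 100 KeyPoint values

    KeyPoint kp{};
    kp.x = 16;
    kp.y = 32;
    points.push_back(kp);

    for(size_t i = 0; i < points.num_values(); ++i)
    {
        const KeyPoint &p = points.at(i);
        // ... consume p ...
    }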
+ */
+#ifndef __ARM_COMPUTE_CLARRAY_H__
+#define __ARM_COMPUTE_CLARRAY_H__
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+/** CLArray implementation */
+template <class T>
+class CLArray : public ICLArray<T>
+{
+public:
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLArray(const CLArray &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    const CLArray &operator=(const CLArray &) = delete;
+    /** Constructor: initializes an array which can contain up to max_num_values values
+     *
+     * @param[in] max_num_values Maximum number of values the array will be able to store
+     */
+    CLArray(size_t max_num_values)
+        : ICLArray<T>(max_num_values), _buffer(cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, max_num_values * sizeof(T)))
+    {
+    }
+    /** Enqueue a map operation of the allocated buffer.
+     *
+     * @param[in] blocking If true, then the mapping will be ready to use by the time
+     *                     this method returns, else it is the caller's responsibility
+     *                     to flush the queue and wait for the mapping operation to have completed.
+     */
+    void map(bool blocking = true)
+    {
+        ICLArray<T>::map(CLScheduler::get().queue(), blocking);
+    }
+    using ICLArray<T>::map;
+    /** Enqueue an unmap operation of the allocated and mapped buffer.
+     *
+     * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+     *       the memory is accessed by the device.
+     */
+    void unmap()
+    {
+        ICLArray<T>::unmap(CLScheduler::get().queue());
+    }
+    using ICLArray<T>::unmap;
+
+    // Inherited methods overridden:
+    const cl::Buffer &cl_buffer() const override
+    {
+        return _buffer;
+    }
+
+protected:
+    // Inherited methods overridden:
+    uint8_t *do_map(cl::CommandQueue &q, bool blocking) override
+    {
+        ARM_COMPUTE_ERROR_ON(nullptr == _buffer.get());
+        return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, this->max_num_values() * sizeof(T)));
+    }
+    void do_unmap(cl::CommandQueue &q, uint8_t *mapping) override
+    {
+        ARM_COMPUTE_ERROR_ON(nullptr == _buffer.get());
+        q.enqueueUnmapMemObject(_buffer, mapping);
+    }
+
+private:
+    cl::Buffer _buffer;
+};
+
+using CLKeyPointArray        = CLArray<KeyPoint>;
+using CLCoordinates2DArray   = CLArray<Coordinates2D>;
+using CLDetectionWindowArray = CLArray<DetectionWindow>;
+using CLSize2DArray          = CLArray<Size2D>;
+using CLUInt8Array           = CLArray<uint8_t>;
+using CLUInt16Array          = CLArray<uint16_t>;
+using CLUInt32Array          = CLArray<uint32_t>;
+using CLInt16Array           = CLArray<int16_t>;
+using CLInt32Array           = CLArray<int32_t>;
+using CLFloatArray           = CLArray<float>;
+}
+#endif /* __ARM_COMPUTE_CLARRAY_H__ */
diff --git a/arm_compute/runtime/CL/CLDistribution1D.h b/arm_compute/runtime/CL/CLDistribution1D.h
new file mode 100644
index 0000000000..55dd1247ed
--- /dev/null
+++ b/arm_compute/runtime/CL/CLDistribution1D.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
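The map()/unmap() pair is how the host reads the CL-side storage back. A sketch, assuming the array was filled on the device beforehand (for instance by CLFastCorners) and reusing IArray's num_values()/at() accessors:

    CLKeyPointArray corners(10000);
    // ... run a CL function that writes into corners ...

    corners.map(); // blocking map: safe to read as soon as this returns
    for(size_t i = 0; i < corners.num_values(); ++i)
    {
        const KeyPoint &kp = corners.at(i);
        // ... consume kp on the host ...
    }
    corners.unmap();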
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLDISTRIBUTION1D_H__ +#define __ARM_COMPUTE_CLDISTRIBUTION1D_H__ + +#include "arm_compute/core/CL/ICLDistribution1D.h" +#include "arm_compute/core/CL/OpenCL.h" + +#include +#include + +namespace arm_compute +{ +/** CLDistribution1D object class */ +class CLDistribution1D : public ICLDistribution1D +{ +public: + /** Constructor: Creates a 1D CLDistribution of a consecutive interval [offset, offset + range - 1] + * defined by a start offset and valid range, divided equally into num_bins parts. + * + * @param[in] num_bins The number of bins the distribution is divided in. + * @param[in] offset The start of the values to use. + * @param[in] range The total number of the consecutive values of the distribution interval. + */ + CLDistribution1D(size_t num_bins, int32_t offset, uint32_t range); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLDistribution1D(const CLDistribution1D &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLDistribution1D &operator=(const CLDistribution1D &) = delete; + /** Enqueue a map operation of the allocated buffer. + * + * @param[in] blocking If true, then the mapping will be ready to use by the time + * this method returns, else it is the caller's responsibility + * to flush the queue and wait for the mapping operation to have completed. + */ + void map(bool blocking = true); + using ICLDistribution1D::map; + /** Enqueue an unmap operation of the allocated and mapped buffer. + * + * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before + * the memory is accessed by the device. + */ + void unmap(); + using ICLDistribution1D::unmap; + + // Inherited methods overridden: + cl::Buffer &cl_buffer() override; + +protected: + // Inherited methods overridden: + uint32_t *do_map(cl::CommandQueue &q, bool blocking) override; + void do_unmap(cl::CommandQueue &q) override; + +private: + cl::Buffer _mem; +}; +} +#endif /* __ARM_COMPUTE_CLDISTRIBUTION1D_H__ */ diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h new file mode 100644 index 0000000000..82929ba139 --- /dev/null +++ b/arm_compute/runtime/CL/CLFunctions.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
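A sketch of pairing this object with the CLHistogram function and reading the bins back on the host; the configure() signature and the buffer() accessor from the IDistribution1D base are assumptions, as neither is shown in this hunk:

    CLDistribution1D dist(256, 0, 256); // 256 bins covering pixel values [0, 255]

    CLHistogram hist;
    hist.configure(&image, &dist); // image: a placeholder U8 ICLImage
    hist.run();

    dist.map(); // blocking
    const uint32_t *bins = dist.buffer();
    // ... consume the 256 bin counts ...
    dist.unmap();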
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLFUNCTIONS_H__ +#define __ARM_COMPUTE_CLFUNCTIONS_H__ + +/* Header regrouping all the CL functions */ +#include "arm_compute/runtime/CL/functions/CLAbsoluteDifference.h" +#include "arm_compute/runtime/CL/functions/CLAccumulate.h" +#include "arm_compute/runtime/CL/functions/CLActivationLayer.h" +#include "arm_compute/runtime/CL/functions/CLArithmeticAddition.h" +#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h" +#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h" +#include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h" +#include "arm_compute/runtime/CL/functions/CLBitwiseNot.h" +#include "arm_compute/runtime/CL/functions/CLBitwiseOr.h" +#include "arm_compute/runtime/CL/functions/CLBitwiseXor.h" +#include "arm_compute/runtime/CL/functions/CLBox3x3.h" +#include "arm_compute/runtime/CL/functions/CLCannyEdge.h" +#include "arm_compute/runtime/CL/functions/CLChannelCombine.h" +#include "arm_compute/runtime/CL/functions/CLChannelExtract.h" +#include "arm_compute/runtime/CL/functions/CLColorConvert.h" +#include "arm_compute/runtime/CL/functions/CLConvolution.h" +#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" +#include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h" +#include "arm_compute/runtime/CL/functions/CLDepthConvert.h" +#include "arm_compute/runtime/CL/functions/CLDerivative.h" +#include "arm_compute/runtime/CL/functions/CLDilate.h" +#include "arm_compute/runtime/CL/functions/CLEqualizeHistogram.h" +#include "arm_compute/runtime/CL/functions/CLErode.h" +#include "arm_compute/runtime/CL/functions/CLFastCorners.h" +#include "arm_compute/runtime/CL/functions/CLFillBorder.h" +#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" +#include "arm_compute/runtime/CL/functions/CLGEMM.h" +#include "arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowp.h" +#include "arm_compute/runtime/CL/functions/CLGaussian3x3.h" +#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" +#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h" +#include "arm_compute/runtime/CL/functions/CLHOGDescriptor.h" +#include "arm_compute/runtime/CL/functions/CLHOGDetector.h" +#include "arm_compute/runtime/CL/functions/CLHOGGradient.h" +#include "arm_compute/runtime/CL/functions/CLHOGMultiDetection.h" +#include 
"arm_compute/runtime/CL/functions/CLHarrisCorners.h" +#include "arm_compute/runtime/CL/functions/CLHistogram.h" +#include "arm_compute/runtime/CL/functions/CLIntegralImage.h" +#include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h" +#include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h" +#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h" +#include "arm_compute/runtime/CL/functions/CLMagnitude.h" +#include "arm_compute/runtime/CL/functions/CLMeanStdDev.h" +#include "arm_compute/runtime/CL/functions/CLMedian3x3.h" +#include "arm_compute/runtime/CL/functions/CLMinMaxLocation.h" +#include "arm_compute/runtime/CL/functions/CLNonLinearFilter.h" +#include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h" +#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h" +#include "arm_compute/runtime/CL/functions/CLOpticalFlow.h" +#include "arm_compute/runtime/CL/functions/CLPhase.h" +#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h" +#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h" +#include "arm_compute/runtime/CL/functions/CLRemap.h" +#include "arm_compute/runtime/CL/functions/CLScale.h" +#include "arm_compute/runtime/CL/functions/CLScharr3x3.h" +#include "arm_compute/runtime/CL/functions/CLSobel3x3.h" +#include "arm_compute/runtime/CL/functions/CLSobel5x5.h" +#include "arm_compute/runtime/CL/functions/CLSobel7x7.h" +#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h" +#include "arm_compute/runtime/CL/functions/CLTableLookup.h" +#include "arm_compute/runtime/CL/functions/CLThreshold.h" +#include "arm_compute/runtime/CL/functions/CLTranspose.h" +#include "arm_compute/runtime/CL/functions/CLWarpAffine.h" +#include "arm_compute/runtime/CL/functions/CLWarpPerspective.h" + +#endif /* __ARM_COMPUTE_CLFUNCTIONS_H__ */ diff --git a/arm_compute/runtime/CL/CLHOG.h b/arm_compute/runtime/CL/CLHOG.h new file mode 100644 index 0000000000..9b4a303eca --- /dev/null +++ b/arm_compute/runtime/CL/CLHOG.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_CLHOG_H__ +#define __ARM_COMPUTE_CLHOG_H__ + +#include "arm_compute/core/CL/ICLHOG.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/HOGInfo.h" +#include "arm_compute/core/Types.h" + +#include + +namespace arm_compute +{ +/** OpenCL implementation of HOG data-object */ +class CLHOG : public ICLHOG +{ +public: + /** Default constructor */ + CLHOG(); + /** Allocate the HOG descriptor using the given HOG's metadata + * + * @param[in] input HOG's metadata used to allocate the HOG descriptor + */ + void init(const HOGInfo &input); + + /** Enqueue a map operation of the allocated buffer. + * + * @param[in] blocking If true, then the mapping will be ready to use by the time + * this method returns, else it is the caller's responsibility + * to flush the queue and wait for the mapping operation to have completed. + */ + void map(bool blocking = true); + using ICLHOG::map; + + /** Enqueue an unmap operation of the allocated and mapped buffer. + * + * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before + * the memory is accessed by the device. + */ + void unmap(); + using ICLHOG::unmap; + + // Inherited method overridden: + void free() override; + const HOGInfo *info() const override; + const cl::Buffer &cl_buffer() const override; + +protected: + // Inherited methods overridden: + uint8_t *do_map(cl::CommandQueue &q, bool blocking) override; + void do_unmap(cl::CommandQueue &q) override; + +private: + HOGInfo _info; + cl::Buffer _buffer; +}; +} +#endif /* __ARM_COMPUTE_CLHOG_H__ */ diff --git a/arm_compute/runtime/CL/CLLut.h b/arm_compute/runtime/CL/CLLut.h new file mode 100644 index 0000000000..9bac2b44c3 --- /dev/null +++ b/arm_compute/runtime/CL/CLLut.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_CLLUT_H__ +#define __ARM_COMPUTE_CLLUT_H__ + +#include "arm_compute/core/CL/ICLLut.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLLutAllocator.h" + +#include +#include + +namespace arm_compute +{ +class ILutAllocator; + +/** Basic implementation of the OpenCL lut interface */ +class CLLut : public ICLLut +{ +public: + /** Constructor */ + CLLut(); + /** Constructor: initializes a LUT which can contain num_values values of data_type type. + * + * @param[in] num_elements Number of elements of the LUT. + * @param[in] data_type Data type of each element. + */ + CLLut(size_t num_elements, DataType data_type); + /** Return a pointer to the lut's allocator + * + * @return A pointer to the lut's allocator + */ + ILutAllocator *allocator(); + /** Enqueue a map operation of the allocated buffer. + * + * @param[in] blocking If true, then the mapping will be ready to use by the time + * this method returns, else it is the caller's responsibility + * to flush the queue and wait for the mapping operation to have completed. + */ + void map(bool blocking = true); + using ICLLut::map; + /** Enqueue an unmap operation of the allocated and mapped buffer. + * + * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before + * the memory is accessed by the device. + */ + void unmap(); + using ICLLut::unmap; + + // Inherited methods overridden: + size_t num_elements() const override; + uint32_t index_offset() const override; + size_t size_in_bytes() const override; + DataType type() const override; + const cl::Buffer &cl_buffer() const override; + void clear() override; + +protected: + // Inherited methods overridden: + uint8_t *do_map(cl::CommandQueue &q, bool blocking) override; + void do_unmap(cl::CommandQueue &q) override; + +private: + CLLutAllocator _allocator; /**< Instance of the OpenCL lut allocator */ +}; +} +#endif /*__ARM_COMPUTE_CLLUT_H__ */ diff --git a/arm_compute/runtime/CL/CLLutAllocator.h b/arm_compute/runtime/CL/CLLutAllocator.h new file mode 100644 index 0000000000..4648ffb51f --- /dev/null +++ b/arm_compute/runtime/CL/CLLutAllocator.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
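A sketch of filling the table on the host and applying it with CLTableLookup; the buffer() accessor (from the ILut base) and CLTableLookup's (input, lut, output) configure() signature are assumptions, as neither appears in this hunk:

    CLLut lut(256, DataType::U8);

    lut.map(); // blocking
    uint8_t *table = lut.buffer();
    for(int i = 0; i < 256; ++i)
    {
        table[i] = 255 - i; // invert the intensity range
    }
    lut.unmap();

    CLTableLookup lookup;
    lookup.configure(&src, &lut, &dst); // src/dst: placeholder U8 CLImages
    lookup.run();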
+ */ +#ifndef __ARM_COMPUTE_CLLUTALLOCATOR_H__ +#define __ARM_COMPUTE_CLLUTALLOCATOR_H__ + +#include "arm_compute/runtime/ILutAllocator.h" + +#include "arm_compute/core/CL/OpenCL.h" + +#include + +namespace arm_compute +{ +/** Basic implementation of a CL memory LUT allocator. */ +class CLLutAllocator : public ILutAllocator +{ +public: + /** Default constructor. */ + CLLutAllocator(); + /** Default destructor. */ + ~CLLutAllocator() = default; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLLutAllocator(const CLLutAllocator &) = delete; + /** Prevent instances of this class from being copy assigned (As this class contains pointers). */ + const CLLutAllocator &operator=(const CLLutAllocator &) = delete; + /** Interface to be implemented by the child class to return the pointer to the mapped data. */ + uint8_t *data(); + /** Interface to be implemented by the child class to return the pointer to the CL data. */ + const cl::Buffer &cl_data() const; + /** Enqueue a map operation of the allocated buffer on the given queue. + * + * @param[in,out] q The CL command queue to use for the mapping operation. + * @param[in] blocking If true, then the mapping will be ready to use by the time + * this method returns, else it is the caller's responsibility + * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer. + * + * @return The mapping address. + */ + uint8_t *map(cl::CommandQueue &q, bool blocking); + /** Enqueue an unmap operation of the allocated buffer on the given queue. + * + * @note This method simply enqueue the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before + * the memory is accessed by the device. + * + * @param[in,out] q The CL command queue to use for the mapping operation. + * @param[in] mapping The cpu mapping to unmap. + */ + void unmap(cl::CommandQueue &q, uint8_t *mapping); + +protected: + /** Allocate num_elements() * sizeof(type()) of OpenCL memory. */ + void allocate() override; + /** Call map() on the OpenCL buffer. + * + * @return A pointer to the beginning of the LUT's allocation. + */ + uint8_t *lock() override; + /** Call unmap() on the OpenCL buffer. */ + void unlock() override; + +private: + cl::Buffer _buffer; /**< OpenCL buffer containing the LUT data. */ + uint8_t *_mapping; /**< Pointer to the CPU mapping of the OpenCL buffer. */ +}; +} + +#endif /* __ARM_COMPUTE_CLLUTALLOCATOR_H__ */ diff --git a/arm_compute/runtime/CL/CLMultiHOG.h b/arm_compute/runtime/CL/CLMultiHOG.h new file mode 100644 index 0000000000..17bb4e03c1 --- /dev/null +++ b/arm_compute/runtime/CL/CLMultiHOG.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLMULTIHOG_H__
+#define __ARM_COMPUTE_CLMULTIHOG_H__
+
+#include "arm_compute/core/CL/ICLMultiHOG.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLHOG.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Basic implementation of the CL multi HOG data-objects */
+class CLMultiHOG : public ICLMultiHOG
+{
+public:
+    /** Constructor
+     *
+     * @param[in] num_models Number of HOG data objects to contain
+     *
+     */
+    CLMultiHOG(size_t num_models);
+
+    // Inherited methods overridden:
+    size_t num_models() const override;
+    ICLHOG *cl_model(size_t index) override;
+    const ICLHOG *cl_model(size_t index) const override;
+
+private:
+    size_t _num_models;
+    std::unique_ptr<CLHOG[]> _model;
+};
+}
+#endif /*__ARM_COMPUTE_CLMULTIHOG_H__ */
diff --git a/arm_compute/runtime/CL/CLMultiImage.h b/arm_compute/runtime/CL/CLMultiImage.h
new file mode 100644
index 0000000000..f70929db07
--- /dev/null
+++ b/arm_compute/runtime/CL/CLMultiImage.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLMULTIIMAGE_H__
+#define __ARM_COMPUTE_CLMULTIIMAGE_H__
+
+#include "arm_compute/core/CL/ICLMultiImage.h"
+#include "arm_compute/core/MultiImageInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include <array>
+
+namespace arm_compute
+{
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Basic implementation of the CL multi-planar image interface */
+class CLMultiImage : public ICLMultiImage
+{
+public:
+    /** Constructor */
+    CLMultiImage();
+    /** Init the multi-planar image
+     *
+     * @param[in] width  Width of the whole image
+     * @param[in] height Height of the whole image
+     * @param[in] format Format of the whole image
+     */
+    void init(unsigned int width, unsigned int height, Format format);
+    /** Init the multi-planar image
+     *
+     * @note Uses conservative padding strategy which fits all kernels.
+ *
+ * @param[in] width  Width of the whole image
+ * @param[in] height Height of the whole image
+ * @param[in] format Format of the whole image
+ */
+    void init_auto_padding(unsigned int width, unsigned int height, Format format);
+    /** Allocate a previously initialised multi image
+     *
+     * @note The multi image must not already be allocated when calling this function.
+     *
+     **/
+    void allocate();
+
+    // Inherited methods overridden:
+    const MultiImageInfo *info() const override;
+    CLImage *cl_plane(unsigned int index) override;
+    const CLImage *cl_plane(unsigned int index) const override;
+
+private:
+    /** Init the multi-planar image
+     *
+     * @param[in] width        Width of the whole image
+     * @param[in] height       Height of the whole image
+     * @param[in] format       Format of the whole image
+     * @param[in] auto_padding Specifies whether the image uses auto padding
+     */
+    void internal_init(unsigned int width, unsigned int height, Format format, bool auto_padding);
+
+    MultiImageInfo _info;           /**< Instance of the multi-planar image's metadata */
+    std::array<CLTensor, 3> _plane; /**< Instances of CLImage to hold the image planes */
+};
+}
+#endif /*__ARM_COMPUTE_CLMULTIIMAGE_H__ */
diff --git a/arm_compute/runtime/CL/CLPyramid.h b/arm_compute/runtime/CL/CLPyramid.h
new file mode 100644
index 0000000000..5e0afb3c63
--- /dev/null
+++ b/arm_compute/runtime/CL/CLPyramid.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLPYRAMID_H__
+#define __ARM_COMPUTE_CLPYRAMID_H__
+
+#include "arm_compute/core/IPyramid.h"
+#include "arm_compute/core/PyramidInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include <cstddef>
+#include <memory>
+
+namespace arm_compute
+{
+class CLTensor;
+
+/** Basic implementation of the OpenCL pyramid interface */
+class CLPyramid : public IPyramid<CLTensor>
+{
+public:
+    /** Default constructor */
+    CLPyramid();
+    /** Initialize pyramid data-object using the given Pyramid's metadata
+     *
+     * @param[in] info Pyramid's metadata
+     */
+    void init(const PyramidInfo &info);
+
+    /** Initialize pyramid data-object using the given Pyramid's metadata
+     *
+     * @note Uses conservative padding strategy which fits all kernels.
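A minimal allocation sketch for a two-plane format (the dimensions are placeholders):

    CLMultiImage nv12;
    nv12.init(640, 480, Format::NV12);
    nv12.allocate();

    CLImage *y_plane  = nv12.cl_plane(0); // full-resolution luma plane
    CLImage *uv_plane = nv12.cl_plane(1); // sub-sampled interleaved chroma plane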
+ *
+ * @param[in] info Pyramid's metadata
+ */
+    void init_auto_padding(const PyramidInfo &info);
+
+    /** Allocate the planes in the pyramid
+     *
+     * @note The pyramid must not already be allocated when calling this function.
+     *
+     **/
+    void allocate();
+
+    // Inherited method overridden
+    const PyramidInfo *info() const override;
+    CLTensor *get_pyramid_level(size_t index) const override;
+
+private:
+    /** Initialize pyramid data-object using the given Pyramid's metadata
+     *
+     * @param[in] info         Pyramid's metadata
+     * @param[in] auto_padding Specifies whether the images in the pyramid use auto padding
+     */
+    void internal_init(const PyramidInfo &info, bool auto_padding);
+
+    PyramidInfo _info;
+    std::unique_ptr<CLTensor[]> _pyramid;
+};
+}
+#endif /*__ARM_COMPUTE_CLPYRAMID_H__ */
diff --git a/arm_compute/runtime/CL/CLScheduler.h b/arm_compute/runtime/CL/CLScheduler.h
new file mode 100644
index 0000000000..8e80259b59
--- /dev/null
+++ b/arm_compute/runtime/CL/CLScheduler.h
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSCHEDULER_H__
+#define __ARM_COMPUTE_CLSCHEDULER_H__
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLTypes.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLKernel;
+
+/** Provides global access to a CL context and command queue. */
+class CLScheduler
+{
+private:
+    /** Constructor */
+    CLScheduler();
+
+public:
+    /** Access the scheduler singleton.
+     *
+     * @return The scheduler
+     */
+    static CLScheduler &get();
+    /** Initialises the context and command queue used by the scheduler to default values
+     *  and sets a default device and kernel path for the @ref CLKernelLibrary.
+     */
+    void default_init()
+    {
+        CLKernelLibrary::get().init("./cl_kernels/", cl::Context::getDefault(), cl::Device::getDefault());
+        init(cl::Context::getDefault(), cl::CommandQueue::getDefault(), cl::Device::getDefault());
+    }
+    /** Schedule the execution of the passed kernel if possible.
+     *
+     * @param[in] kernel Kernel to execute.
+     * @param[in] flush  (Optional) Specifies if the command queue will be flushed after running the kernel.
+     */
+    void enqueue(ICLKernel &kernel, bool flush = true);
+
+    /** Initialises the context and command queue to be used by the scheduler.
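A sketch of building a half-scale pyramid; PyramidInfo's (num_levels, scale, width, height, format) constructor and the SCALE_PYRAMID_HALF constant are assumed from core/PyramidInfo.h and core/Types.h, neither of which is in this hunk:

    PyramidInfo info(4, SCALE_PYRAMID_HALF, 640, 480, Format::U8);

    CLPyramid pyramid;
    pyramid.init(info);
    pyramid.allocate();

    CLTensor *level0 = pyramid.get_pyramid_level(0); // 640x480; each level halves the size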
+ * + * @param[in] context A CL context. + * @param[in] queue A CL command queue. + * @param[in] device A CL device. + */ + void init(cl::Context context = cl::Context::getDefault(), cl::CommandQueue queue = cl::CommandQueue::getDefault(), + cl::Device device = cl::Device::getDefault()) + { + _context = std::move(context); + _queue = std::move(queue); + _target = get_target_from_device(device); + } + + /** Accessor for the associated CL context. + * + * @return A CL context. + */ + cl::Context &context() + { + return _context; + } + + /** Accessor to set the CL context to be used by the scheduler. + * + * @param[in] context A CL context. + */ + void set_context(cl::Context context) + { + _context = std::move(context); + } + + /** Accessor for the associated CL command queue. + * + * @return A CL command queue. + */ + cl::CommandQueue &queue() + { + return _queue; + } + + /** Get the target GPU. + * + * @return The target GPU. + */ + GPUTarget target() const + { + return _target; + } + + /** Accessor to set the CL command queue to be used by the scheduler. + * + * @param[in] queue A CL command queue. + */ + void set_queue(cl::CommandQueue queue) + { + _queue = std::move(queue); + } + + /** Accessor to set target GPU to be used by the scheduler. + * + * @param[in] target The target GPU. + */ + void set_target(GPUTarget target) + { + _target = target; + } + + /** Blocks until all commands in the associated command queue have finished. */ + void sync() + { + _queue.finish(); + } + + /** Enqueues a marker into the associated command queue and return the event. + * + * @return An event that can be waited on to block the executing thread. + */ + cl::Event enqueue_sync_event() + { + cl::Event event; + _queue.enqueueMarker(&event); + + return event; + } + +private: + cl::Context _context; + cl::CommandQueue _queue; + GPUTarget _target; +}; +} +#endif /* __ARM_COMPUTE_CLSCHEDULER_H__ */ diff --git a/arm_compute/runtime/CL/CLSubTensor.h b/arm_compute/runtime/CL/CLSubTensor.h new file mode 100644 index 0000000000..4bab164779 --- /dev/null +++ b/arm_compute/runtime/CL/CLSubTensor.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
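In application code the scheduler is usually touched in exactly two places: once to initialise it before any function is configured, and once to synchronise before results are read back:

    CLScheduler::get().default_init(); // once, at start-up

    // ... configure and run() CL functions ...

    CLScheduler::get().sync(); // block until all enqueued CL work has finished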
+ */ +#ifndef __ARM_COMPUTE_CLSUBTENSOR_H__ +#define __ARM_COMPUTE_CLSUBTENSOR_H__ + +#include "arm_compute/core/SubTensorInfo.h" +#include "arm_compute/runtime/CL/CLTensor.h" + +#include + +namespace arm_compute +{ +class ITensorInfo; + +/** Basic implementation of the OpenCL sub-tensor interface */ +class CLSubTensor : public ICLTensor +{ +public: + /** Constructor + * + * @param[in] parent Parent tensor + * @param[in] tensor_shape Shape of the subtensor + * @param[in] coords Coordinates of the first subtensor element inside the parent tensor. + */ + CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords); + /** Destructor: free the tensor's memory */ + ~CLSubTensor() = default; + /** Restrict instances of this class to be copy constructed */ + CLSubTensor(const CLSubTensor &) = delete; + /** Restrict instances of this class to be copied */ + CLSubTensor &operator=(const CLSubTensor &) = delete; + /** Allow instances of this class to be move constructed */ + CLSubTensor(CLSubTensor &&) = default; + /** Allow instances of this class to be moved */ + CLSubTensor &operator=(CLSubTensor &&) = default; + + /** Enqueue a map operation of the allocated buffer. + * + * @note Mapping a subtensor will lead to the mapping of the whole parent tensor for now. + * + * @param[in] blocking If true, then the mapping will be ready to use by the time + * this method returns, else it is the caller's responsibility + * to flush the queue and wait for the mapping operation to have completed. + */ + void map(bool blocking = true); + using ICLTensor::map; + /** Enqueue an unmap operation of the allocated and mapped buffer. + * + * @note Unmapping a subtensor will lead to the unmapping of the whole parent tensor for now. + * + * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before + * the memory is accessed by the device. + */ + void unmap(); + using ICLTensor::unmap; + + /** Return the parent tensor of the subtensor + * + * @return Parent tensor + */ + ICLTensor *parent(); + + // Inherited methods overridden: + ITensorInfo *info() const override; + ITensorInfo *info() override; + const cl::Buffer &cl_buffer() const override; + +protected: + // Inherited methods overridden: + uint8_t *do_map(cl::CommandQueue &q, bool blocking) override; + void do_unmap(cl::CommandQueue &q) override; + +private: + ICLTensor *_parent; + mutable SubTensorInfo _info; +}; +} +#endif /*__ARM_COMPUTE_CLSUBTENSOR_H__ */ diff --git a/arm_compute/runtime/CL/CLTensor.h b/arm_compute/runtime/CL/CLTensor.h new file mode 100644 index 0000000000..2c685d1ed1 --- /dev/null +++ b/arm_compute/runtime/CL/CLTensor.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
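A sketch of viewing a slice of an existing tensor without copying; the shape, coordinates and TensorInfo constructor used here are illustrative placeholders:

    CLTensor parent;
    parent.allocator()->init(TensorInfo(TensorShape(64U, 64U, 16U), 1, DataType::F32));
    parent.allocator()->allocate();

    // An 8-channel view starting at channel 4; it shares the parent's buffer:
    CLSubTensor view(&parent, TensorShape(64U, 64U, 8U), Coordinates(0, 0, 4));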
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLTENSOR_H__ +#define __ARM_COMPUTE_CLTENSOR_H__ + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/runtime/CL/CLTensorAllocator.h" + +#include + +namespace arm_compute +{ +class ITensorAllocator; +class ITensorInfo; + +/** Basic implementation of the OpenCL tensor interface */ +class CLTensor : public ICLTensor +{ +public: + /** Constructor */ + CLTensor(); + /** Return a pointer to the tensor's allocator + * + * @return A pointer to the tensor's allocator + */ + ITensorAllocator *allocator(); + /** Enqueue a map operation of the allocated buffer. + * + * @param[in] blocking If true, then the mapping will be ready to use by the time + * this method returns, else it is the caller's responsibility + * to flush the queue and wait for the mapping operation to have completed. + */ + void map(bool blocking = true); + using ICLTensor::map; + /** Enqueue an unmap operation of the allocated and mapped buffer. + * + * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before + * the memory is accessed by the device. + */ + void unmap(); + using ICLTensor::unmap; + + // Inherited methods overridden: + TensorInfo *info() const override; + TensorInfo *info() override; + const cl::Buffer &cl_buffer() const override; + +protected: + // Inherited methods overridden: + uint8_t *do_map(cl::CommandQueue &q, bool blocking) override; + void do_unmap(cl::CommandQueue &q) override; + +private: + mutable CLTensorAllocator _allocator; /**< Instance of the OpenCL tensor allocator */ +}; + +using CLImage = CLTensor; +} +#endif /*__ARM_COMPUTE_CLTENSOR_H__ */ diff --git a/arm_compute/runtime/CL/CLTensorAllocator.h b/arm_compute/runtime/CL/CLTensorAllocator.h new file mode 100644 index 0000000000..ed371e0642 --- /dev/null +++ b/arm_compute/runtime/CL/CLTensorAllocator.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
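The usual CLTensor lifecycle is init, allocate, then map()/unmap() around any host-side access; buffer() is assumed from the ITensor base, which is not part of this hunk:

    CLTensor tensor;
    tensor.allocator()->init(TensorInfo(TensorShape(256U, 256U), 1, DataType::F32));
    tensor.allocator()->allocate();

    tensor.map(); // blocking map
    auto *data = reinterpret_cast<float *>(tensor.buffer());
    data[0] = 1.0f;
    tensor.unmap();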
diff --git a/arm_compute/runtime/CL/CLTensorAllocator.h b/arm_compute/runtime/CL/CLTensorAllocator.h new file mode 100644 index 0000000000..ed371e0642 --- /dev/null +++ b/arm_compute/runtime/CL/CLTensorAllocator.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLTENSORALLOCATOR_H__ +#define __ARM_COMPUTE_CLTENSORALLOCATOR_H__ + +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/runtime/ITensorAllocator.h" + +#include <cstdint> + +namespace arm_compute +{ +/** Basic implementation of a CL memory tensor allocator. */ +class CLTensorAllocator : public ITensorAllocator +{ +public: + /** Default constructor. */ + CLTensorAllocator(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLTensorAllocator(const CLTensorAllocator &) = delete; + /** Prevent instances of this class from being copy assigned (As this class contains pointers). */ + CLTensorAllocator &operator=(const CLTensorAllocator &) = delete; + /** Allow instances of this class to be moved */ + CLTensorAllocator(CLTensorAllocator &&) = default; + /** Allow instances of this class to be moved */ + CLTensorAllocator &operator=(CLTensorAllocator &&) = default; + /** Default destructor */ + ~CLTensorAllocator() = default; + + /** Return a pointer to the CPU mapping of the tensor data. */ + uint8_t *data(); + /** Return the OpenCL buffer containing the tensor data. */ + const cl::Buffer &cl_data() const; + /** Enqueue a map operation of the allocated buffer on the given queue. + * + * @param[in,out] q The CL command queue to use for the mapping operation. + * @param[in] blocking If true, then the mapping will be ready to use by the time + * this method returns, else it is the caller's responsibility + * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer. + * + * @return The mapping address. + */ + uint8_t *map(cl::CommandQueue &q, bool blocking); + /** Enqueue an unmap operation of the allocated buffer on the given queue. + * + * @note This method simply enqueues the unmap operation; it is the caller's responsibility to flush the queue and make sure the unmap is finished before + * the memory is accessed by the device. + * + * @param[in,out] q The CL command queue to use for the unmap operation. + * @param[in] mapping The CPU mapping to unmap. + */ + void unmap(cl::CommandQueue &q, uint8_t *mapping); + + /** Allocate OpenCL memory of the size specified by the tensor's TensorInfo. + * + * @note The tensor must not already be allocated when calling this function. + * + */ + void allocate() override; + + /** Free allocated OpenCL memory. + * + * @note The tensor must have been allocated when calling this function. + * + */ + void free() override; + +protected: + /** Call map() on the OpenCL buffer. + * + * @return A pointer to the beginning of the tensor's allocation. + */ + uint8_t *lock() override; + /** Call unmap() on the OpenCL buffer. */ + void unlock() override; + +private: + cl::Buffer _buffer; /**< OpenCL buffer containing the tensor data. */ + uint8_t *_mapping; /**< Pointer to the CPU mapping of the OpenCL buffer. */
+}; +} +#endif /* __ARM_COMPUTE_CLTENSORALLOCATOR_H__ */ diff --git a/arm_compute/runtime/CL/ICLSimpleFunction.h b/arm_compute/runtime/CL/ICLSimpleFunction.h new file mode 100644 index 0000000000..130c58a98c --- /dev/null +++ b/arm_compute/runtime/CL/ICLSimpleFunction.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_ICLSIMPLEFUNCTION_H__ +#define __ARM_COMPUTE_ICLSIMPLEFUNCTION_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" +#include "arm_compute/runtime/IFunction.h" + +#include <memory> + +namespace arm_compute +{ +/** Basic interface for functions which have a single OpenCL kernel */ +class ICLSimpleFunction : public IFunction +{ +public: + /** Default constructor */ + ICLSimpleFunction(); + + // Inherited methods overridden: + void run() override final; + +protected: + std::unique_ptr<ICLKernel> _kernel; /**< Kernel to run */ + CLFillBorderKernel _border_handler; /**< Kernel to handle borders */ +}; +} +#endif /*__ARM_COMPUTE_ICLSIMPLEFUNCTION_H__ */
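Illustrative sketch (an assumption, not library source): how a derived function typically uses this interface. configure() instantiates the kernel, and the inherited run() then enqueues _border_handler followed by _kernel on the scheduler's queue. The filter class and its kernel choice are hypothetical.

```cpp
#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"

#include <memory>

namespace arm_compute
{
class CLMyBoxFilter : public ICLSimpleFunction
{
public:
    void configure(ICLTensor *input, ICLTensor *output)
    {
        // Create and configure the single kernel this function wraps
        std::unique_ptr<CLBox3x3Kernel> k(new CLBox3x3Kernel());
        k->configure(input, output, false /* border_undefined */);
        _kernel = std::move(k);
        // Fill the border region the kernel will read
        _border_handler.configure(input, _kernel->border_size(), BorderMode::REPLICATE);
    }
};
}
```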
diff --git a/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h b/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h new file mode 100644 index 0000000000..40ee396644 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLABSOLUTEDIFFERENCE_H__ +#define __ARM_COMPUTE_CLABSOLUTEDIFFERENCE_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLAbsoluteDifferenceKernel + * + * @note The tensor data types for the inputs must be U8 or S16. + * @note The function also calculates the absolute difference when the two inputs have different data types. + */ +class CLAbsoluteDifference : public ICLSimpleFunction +{ +public: + /** Initialize the function + * + * @param[in] input1 First input tensor. Data types supported: U8, S16 + * @param[in] input2 Second input tensor. Data types supported: U8, S16 + * @param[out] output Output tensor. Data types supported: U8, S16 + */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); +}; +} +#endif /* __ARM_COMPUTE_CLABSOLUTEDIFFERENCE_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLAccumulate.h b/arm_compute/runtime/CL/functions/CLAccumulate.h new file mode 100644 index 0000000000..51f6df9acb --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLAccumulate.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLACCUMULATE_H__ +#define __ARM_COMPUTE_CLACCUMULATE_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +#include <cstdint> + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLAccumulateKernel */ +class CLAccumulate : public ICLSimpleFunction +{ +public: + /** Set the input and accumulation tensors. + * + * @param[in] input Source tensor. Data types supported: U8. + * @param[out] accum Destination tensor. Data types supported: S16. + */ + void configure(const ICLTensor *input, ICLTensor *accum); +}; + +/** Basic function to run @ref CLAccumulateWeightedKernel */ +class CLAccumulateWeighted : public ICLSimpleFunction +{ +public: + /** Set the input and accumulation tensors, and the scale value. + * + * @param[in] input Source tensor. Data types supported: U8. + * @param[in] alpha The input scalar value, within the range [0, 1.0]. Data types supported: F32. + * @param[in,out] accum Accumulated tensor. Data types supported: U8. + */ + void configure(const ICLTensor *input, float alpha, ICLTensor *accum); +}; + +/** Basic function to run @ref CLAccumulateSquaredKernel */ +class CLAccumulateSquared : public ICLSimpleFunction +{ +public: + /** Set the input and accumulation tensors and the shift value. + * + * @param[in] input Source tensor. Data types supported: U8. + * @param[in] shift The shift value, within the range [0, 15]. Data types supported: U32. + * @param[in,out] accum Accumulated tensor. Data types supported: S16. + */ + void configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum); +}; +} +#endif /*__ARM_COMPUTE_CLACCUMULATE_H__ */
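Illustrative usage sketch (not part of the patch): a running average built with CLAccumulateWeighted. The tensors and the 0.5 weight are hypothetical.

```cpp
#include "arm_compute/runtime/CL/functions/CLAccumulate.h"

using namespace arm_compute;

void accumulate_sketch(const ICLTensor *frame, ICLTensor *average)
{
    CLAccumulateWeighted acc;
    // average = (1 - 0.5) * average + 0.5 * frame
    acc.configure(frame, 0.5f, average);
    acc.run();
}
```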
diff --git a/arm_compute/runtime/CL/functions/CLActivationLayer.h b/arm_compute/runtime/CL/functions/CLActivationLayer.h new file mode 100644 index 0000000000..6468c996a2 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLActivationLayer.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLACTIVATIONLAYER_H__ +#define __ARM_COMPUTE_CLACTIVATIONLAYER_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLActivationLayerKernel + * + * @note The function applies the activation function specified in @p act_info to the input tensor. + */ +class CLActivationLayer : public ICLSimpleFunction +{ +public: + /** Set the input and output tensor. + * + * @param[in] input Source tensor. Data types supported: F16, F32, U16, S16. + * @param[out] output Destination tensor. Data type should match the input data type. + * @param[in] act_info Activation layer parameters. + */ + void configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info); +}; +} +#endif /* __ARM_COMPUTE_CLACTIVATIONLAYER_H__ */
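Illustrative usage sketch (not part of the patch): configure and run a ReLU activation. The choice of activation function is hypothetical.

```cpp
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"

using namespace arm_compute;

void activation_sketch(const ICLTensor *input, ICLTensor *output)
{
    CLActivationLayer act;
    act.configure(input, output,
                  ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
    act.run(); // enqueues the activation kernel
}
```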
diff --git a/arm_compute/runtime/CL/functions/CLArithmeticAddition.h b/arm_compute/runtime/CL/functions/CLArithmeticAddition.h new file mode 100644 index 0000000000..feadf39820 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLArithmeticAddition.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLARITHMETICADDITION_H__ +#define __ARM_COMPUTE_CLARITHMETICADDITION_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLArithmeticAdditionKernel + * + * @note The tensor data type for the inputs must be U8, S16, F16, F32. + * @note The function performs an arithmetic addition between two tensors. + */ +class CLArithmeticAddition : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in] input1 First tensor input. Data types supported: U8, S16, F16, F32. + * @param[in] input2 Second tensor input. Data types supported: U8, S16, F16, F32. + * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), S16, F16, F32. + * @param[in] policy Policy to use to handle overflow. + */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy); +}; +} +#endif /* __ARM_COMPUTE_CLARITHMETICADDITION_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h b/arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h new file mode 100644 index 0000000000..d7bb21144e --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLARITHMETICSUBTRACTION_H__ +#define __ARM_COMPUTE_CLARITHMETICSUBTRACTION_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLArithmeticSubtractionKernel + * + * @note The tensor data type for the inputs must be U8, S16, F16, F32. + * @note The function performs an arithmetic subtraction between two tensors. + */ +class CLArithmeticSubtraction : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in] input1 First tensor input. Data types supported: U8, S16, F16, F32. + * @param[in] input2 Second tensor input. Data types supported: U8, S16, F16, F32. + * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), S16, F16, F32. + * @param[in] policy Policy to use to handle overflow. + */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy); +}; +} +#endif /* __ARM_COMPUTE_CLARITHMETICSUBTRACTION_H__ */
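Illustrative usage sketch (not part of the patch): element-wise addition with saturation on overflow. The same pattern applies to CLArithmeticSubtraction.

```cpp
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/functions/CLArithmeticAddition.h"

using namespace arm_compute;

void addition_sketch(const ICLTensor *a, const ICLTensor *b, ICLTensor *sum)
{
    CLArithmeticAddition add;
    add.configure(a, b, sum, ConvertPolicy::SATURATE); // clamp instead of wrapping
    add.run();
}
```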
diff --git a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h new file mode 100644 index 0000000000..d766d1c69c --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLBATCHNORMALIZATIONLAYER_H__ +#define __ARM_COMPUTE_CLBATCHNORMALIZATIONLAYER_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLBatchNormalizationLayerKernel and simulate a batch normalization layer. + * + * Batch normalization is calculated by: + * @f[ out_i = \gamma * (\frac{in_i - \mu_{B}}{\sqrt{\sigma^2_{B} + \epsilon}}) + \beta \equiv BN_{\gamma,\beta}(in_i) @f] + * + */ +class CLBatchNormalizationLayer : public IFunction +{ +public: + /** Default constructor */ + CLBatchNormalizationLayer(); + /** Set the input and output tensors. + * + * @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, FM]. + * The rest are optional and used for representing batches. Data types supported: F32. + * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input + * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] beta Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] epsilon Small value to avoid division with zero. + */ + void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon); + + // Inherited methods overridden: + void run() override; + +private: + CLBatchNormalizationLayerKernel _norm_kernel; /**< BatchNormalization layer kernel to run */ +}; +} +#endif /* __ARM_COMPUTE_CLBATCHNORMALIZATIONLAYER_H__ */
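Illustrative usage sketch (not part of the patch): wiring up batch normalization for a [width, height, FM] input. All tensors are assumed to be initialised and allocated F32 CLTensors of the appropriate shapes; the epsilon value is hypothetical.

```cpp
#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"

using namespace arm_compute;

void batchnorm_sketch(const ICLTensor *input, ICLTensor *output,
                      const ICLTensor *mean, const ICLTensor *var,
                      const ICLTensor *beta, const ICLTensor *gamma)
{
    CLBatchNormalizationLayer bn;
    bn.configure(input, output, mean, var, beta, gamma, 1e-5f /* epsilon */);
    bn.run();
}
```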
diff --git a/arm_compute/runtime/CL/functions/CLBitwiseAnd.h b/arm_compute/runtime/CL/functions/CLBitwiseAnd.h new file mode 100644 index 0000000000..a4a523baaa --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLBitwiseAnd.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLBITWISEAND_H__ +#define __ARM_COMPUTE_CLBITWISEAND_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLBitwiseAndKernel. + * + * @note The tensor data type for the inputs must be U8. + * @note The function performs a bitwise AND operation using the two input tensors. + */ +class CLBitwiseAnd : public ICLSimpleFunction +{ +public: + /** Initialize the function + * + * @param[in] input1 Input tensor. Data types supported: U8. + * @param[in] input2 Input tensor. Data types supported: U8. + * @param[out] output Output tensor. Data types supported: U8. + */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); +}; +} +#endif /* __ARM_COMPUTE_CLBITWISEAND_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLBitwiseNot.h b/arm_compute/runtime/CL/functions/CLBitwiseNot.h new file mode 100644 index 0000000000..0ff16af870 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLBitwiseNot.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLBITWISENOT_H__ +#define __ARM_COMPUTE_CLBITWISENOT_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLBitwiseNotKernel. + * + * @note The tensor data type for the inputs must be U8. + * @note The function performs a bitwise NOT operation on the input tensor. + */ +class CLBitwiseNot : public ICLSimpleFunction +{ +public: + /** Initialize the function + * + * @param[in] input Input tensor. Data types supported: U8. + * @param[out] output Output tensor. Data types supported: U8. + */ + void configure(const ICLTensor *input, ICLTensor *output); +}; +} +#endif /* __ARM_COMPUTE_CLBITWISENOT_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLBitwiseOr.h b/arm_compute/runtime/CL/functions/CLBitwiseOr.h new file mode 100644 index 0000000000..880c4762be --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLBitwiseOr.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLBITWISEOR_H__ +#define __ARM_COMPUTE_CLBITWISEOR_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLBitwiseOrKernel. + * + * @note The tensor data type for the inputs must be U8. + * @note The function performs a bitwise OR operation using the two input tensors. + */ +class CLBitwiseOr : public ICLSimpleFunction +{ +public: + /** Initialize the function + * + * @param[in] input1 Input tensor. Data types supported: U8. + * @param[in] input2 Input tensor. Data types supported: U8. + * @param[out] output Output tensor. Data types supported: U8. + */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); +}; +} +#endif /* __ARM_COMPUTE_CLBITWISEOR_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLBitwiseXor.h b/arm_compute/runtime/CL/functions/CLBitwiseXor.h new file mode 100644 index 0000000000..772dec22ea --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLBitwiseXor.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLBITWISEXOR_H__ +#define __ARM_COMPUTE_CLBITWISEXOR_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLBitwiseXorKernel. + * + * @note The tensor data type for the inputs must be U8. + * @note The function performs a bitwise XOR operation using the two input tensors. + */ +class CLBitwiseXor : public ICLSimpleFunction +{ +public: + /** Initialize the function + * + * @param[in] input1 Input tensor. Data types supported: U8. + * @param[in] input2 Input tensor. Data types supported: U8. + * @param[out] output Output tensor. Data types supported: U8. 
+ */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); +}; +} +#endif /* __ARM_COMPUTE_CLBITWISEXOR_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLBox3x3.h b/arm_compute/runtime/CL/functions/CLBox3x3.h new file mode 100644 index 0000000000..5e51c1a390 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLBox3x3.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLBOX3X3_H__ +#define __ARM_COMPUTE_CLBOX3X3_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +#include <cstdint> + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to execute box filter 3x3. This function calls the following OpenCL kernels: + * + * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE) + * -# @ref CLBox3x3Kernel + * + */ +class CLBox3x3 : public ICLSimpleFunction +{ +public: + /** Initialise the function's source, destination and border mode. + * + * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output Destination tensor. Data types supported: U8. + * @param[in] border_mode Border mode to use for the convolution. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + */ + void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0); +}; +} +#endif /*__ARM_COMPUTE_CLBOX3X3_H__ */
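Illustrative usage sketch (not part of the patch): a 3x3 box filter with replicated borders, so the output stays valid at the image edges.

```cpp
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/functions/CLBox3x3.h"

using namespace arm_compute;

void box_filter_sketch(ICLTensor *input, ICLTensor *output)
{
    CLBox3x3 box;
    box.configure(input, output, BorderMode::REPLICATE);
    box.run();
}
```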
diff --git a/arm_compute/runtime/CL/functions/CLCannyEdge.h b/arm_compute/runtime/CL/functions/CLCannyEdge.h new file mode 100644 index 0000000000..e5a82b2263 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLCannyEdge.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLCANNYEDGE_H__ +#define __ARM_COMPUTE_CLCANNYEDGE_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/CL/kernels/CLCannyEdgeKernel.h" +#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" +#include "arm_compute/runtime/CL/CLTensor.h" + +#include <memory> + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to execute the Canny edge detector on OpenCL. This function calls the following OpenCL kernels and functions: + * + * -# @ref CLFillBorderKernel (if border_mode == REPLICATE or border_mode == CONSTANT) + * -# @ref CLSobel3x3 (if gradient_size == 3) or @ref CLSobel5x5 (if gradient_size == 5) or @ref CLSobel7x7 (if gradient_size == 7) + * -# @ref CLGradientKernel + * -# @ref CLEdgeNonMaxSuppressionKernel + * -# @ref CLEdgeTraceKernel + * + */ +class CLCannyEdge : public IFunction +{ +public: + /** Constructor */ + CLCannyEdge(); + /** Initialise the function's source, destination, thresholds, gradient size, normalization type and border mode. + * + * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for border_mode != UNDEFINED) + * @param[out] output Destination tensor. Data types supported: U8. + * @param[in] upper_thr Upper threshold used for the hysteresis. + * @param[in] lower_thr Lower threshold used for the hysteresis. + * @param[in] gradient_size Gradient size (3, 5 or 7). + * @param[in] norm_type Normalization type. If 1, L1-Norm is used, otherwise L2-Norm. + * @param[in] border_mode Border mode to use for the convolution. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + */ + void configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, + BorderMode border_mode, uint8_t constant_border_value = 0); + + // Inherited methods overridden: + virtual void run() override; + +private: + std::unique_ptr<IFunction> _sobel; /**< Pointer to the Sobel function. */ + CLGradientKernel _gradient; /**< Gradient kernel. */ + CLFillBorderKernel _border_mag_gradient; /**< Fill border on magnitude tensor kernel */ + CLEdgeNonMaxSuppressionKernel _non_max_suppr; /**< Non-Maxima suppression kernel. */ + CLEdgeTraceKernel _edge_trace; /**< Edge tracing kernel. */ + CLImage _gx; /**< Source tensor - Gx component.
*/ + CLImage _mag; /**< Source tensor - Magnitude. */ + CLImage _phase; /**< Source tensor - Phase. */ + CLImage _nonmax; /**< Source tensor - Non-Maxima suppressed. */ + CLImage _visited, _recorded, _l1_list_counter, _l1_stack; /**< Temporary tensors */ +}; +} + +#endif /* __ARM_COMPUTE_CLCANNYEDGE_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLChannelCombine.h b/arm_compute/runtime/CL/functions/CLChannelCombine.h new file mode 100644 index 0000000000..337e6b4820 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLChannelCombine.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLCHANNELCOMBINE_H__ +#define __ARM_COMPUTE_CLCHANNELCOMBINE_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLMultiImage; +class ICLTensor; +using ICLImage = ICLTensor; + +/** Basic function to run @ref CLChannelCombineKernel to perform channel combination. */ +class CLChannelCombine : public ICLSimpleFunction +{ +public: + /** Initialize function's inputs and outputs. + * + * @param[in] plane0 The 2D plane that forms channel 0. Must be of U8 format. + * @param[in] plane1 The 2D plane that forms channel 1. Must be of U8 format. + * @param[in] plane2 The 2D plane that forms channel 2. Must be of U8 format. + * @param[in] plane3 The 2D plane that forms channel 3. Must be of U8 format. + * @param[out] output The single planar output tensor. + */ + void configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output); + /** Initialize function's inputs and outputs. + * + * @param[in] plane0 The 2D plane that forms channel 0. Must be of U8 format. + * @param[in] plane1 The 2D plane that forms channel 1. Must be of U8 format. + * @param[in] plane2 The 2D plane that forms channel 2. Must be of U8 format. + * @param[out] output The multi planar output image. + */ + void configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output); +}; +} +#endif /*__ARM_COMPUTE_CLCHANNELCOMBINE_H__*/ diff --git a/arm_compute/runtime/CL/functions/CLChannelExtract.h b/arm_compute/runtime/CL/functions/CLChannelExtract.h new file mode 100644 index 0000000000..1753374622 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLChannelExtract.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLCHANNELEXTRACT_H__ +#define __ARM_COMPUTE_CLCHANNELEXTRACT_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLMultiImage; +class ICLTensor; +using ICLImage = ICLTensor; + +/** Basic function to run @ref CLChannelExtractKernel to perform channel extraction. */ +class CLChannelExtract : public ICLSimpleFunction +{ +public: + /** Initialize the function's source, destination + * + * @param[in] input The input tensor to extract the channel from. Formats supported: Any single planar. + * @param[in] channel The channel to extract. + * @param[out] output The extracted channel. Must be of U8 format. + */ + void configure(const ICLTensor *input, Channel channel, ICLTensor *output); + /** Initialize the function's source, destination + * + * @param[in] input The multi-planar input image to extract channel from. + * @param[in] channel The channel to extract. + * @param[out] output The extracted 2D channel. Must be of U8 format. + */ + void configure(const ICLMultiImage *input, Channel channel, ICLImage *output); +}; +} +#endif /*__ARM_COMPUTE_CLCHANNELEXTRACT_H__*/ diff --git a/arm_compute/runtime/CL/functions/CLColorConvert.h b/arm_compute/runtime/CL/functions/CLColorConvert.h new file mode 100644 index 0000000000..12457a0cf2 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLColorConvert.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLCOLORCONVERT_H__ +#define __ARM_COMPUTE_CLCOLORCONVERT_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLMultiImage; +class ICLTensor; +using ICLImage = ICLTensor; + +/** Basic function to run @ref CLColorConvertKernel + * + * @note The function performs color conversion between images. + */ +class CLColorConvert : public ICLSimpleFunction +{ +public: + /** Initialize the function's source, destination + * + * @param[in] input The input single-planar tensor from which to convert + * @param[out] output The converted single-planar output tensor + */ + void configure(const ICLTensor *input, ICLTensor *output); + /** Initialize the function's source, destination + * + * @param[in] input The multi-planar input image from which to convert + * @param[out] output The converted single-planar output image + */ + void configure(const ICLMultiImage *input, ICLImage *output); + /** Initialize the function's source, destination + * + * @param[in] input The single-planar input image from which to convert + * @param[out] output The converted multi-planar output image + */ + void configure(const ICLImage *input, ICLMultiImage *output); + /** Initialize the function's source, destination + * + * @param[in] input The multi-planar input image from which to convert + * @param[out] output The converted multi-planar output image + */ + void configure(const ICLMultiImage *input, ICLMultiImage *output); +}; +} +#endif /* __ARM_COMPUTE_CLCOLORCONVERT_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLConvolution.h b/arm_compute/runtime/CL/functions/CLConvolution.h new file mode 100644 index 0000000000..f526f6ff4a --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLConvolution.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ +#ifndef __ARM_COMPUTE_CLCONVOLUTION_H__ +#define __ARM_COMPUTE_CLCONVOLUTION_H__ + +#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h" +#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +#include "arm_compute/runtime/IFunction.h" + +#include <cstdint> + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to execute convolution of size 3x3. This function calls the following OpenCL kernels: + * + * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE) + * -# @ref CLConvolution3x3Kernel + * + */ +class CLConvolution3x3 : public ICLSimpleFunction +{ +public: + /** Initialize the function's source, destination, conv and border_mode. + * + * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output Destination tensor. Data types supported: U8 or S16. + * @param[in] conv matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer. + * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0. + * @param[in] border_mode Strategy to use for borders. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + */ + void configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0); +}; + +/** Basic function to execute square convolution. Currently it supports 5x5, 7x7, 9x9. This function calls the following OpenCL kernels: + * + * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE) + * -# @ref CLConvolutionKernel or
+ * @ref CLSeparableConvolutionHorKernel and @ref CLSeparableConvolutionVertKernel (if convolution matrix is separable) + * + */ +template <unsigned int matrix_size> +class CLConvolutionSquare : public IFunction +{ +public: + /** Default constructor */ + CLConvolutionSquare(); + /** Initialize the function's source, destination, conv and border_mode. + * + * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output Destination tensor. Data types supported: U8 or S16. + * @param[in] conv matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer. + * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0. + * @param[in] border_mode Strategy to use for borders. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + */ + void configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0); + + // Inherited methods overridden: + void run() override; + +private: + CLTensor _tmp; /**< temporary buffer for output of horizontal pass */ + bool _is_separable; /**< true if the convolution can be separated */ + CLSeparableConvolutionHorKernel<matrix_size> _kernel_hor; /**< kernel for horizontal pass of separated convolution */ + CLSeparableConvolutionVertKernel<matrix_size> _kernel_vert; /**< kernel for vertical pass of separated convolution */ + CLConvolutionKernel<matrix_size> _kernel; /**< kernel for non-separated convolution */ + CLFillBorderKernel _border_handler; /**< kernel for border handling */ +}; + +/** Basic function to run 5x5 convolution. */ +using CLConvolution5x5 = CLConvolutionSquare<5>; +/** Basic function to run 7x7 convolution. */ +using CLConvolution7x7 = CLConvolutionSquare<7>; +/** Basic function to run 9x9 convolution. */ +using CLConvolution9x9 = CLConvolutionSquare<9>;
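Illustrative usage sketch (not part of the patch): a 5x5 convolution with a separable coefficient matrix, so the function can split it into a horizontal and a vertical pass. The coefficients (the outer product of [1 4 6 4 1] with itself) are hypothetical.

```cpp
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/functions/CLConvolution.h"

#include <cstdint>

using namespace arm_compute;

void convolution_sketch(ICLTensor *input, ICLTensor *output)
{
    // 5x5 separable kernel, row-major in a linear buffer
    static const int16_t conv[25] = {
        1,  4,  6,  4, 1,
        4, 16, 24, 16, 4,
        6, 24, 36, 24, 6,
        4, 16, 24, 16, 4,
        1,  4,  6,  4, 1
    };

    CLConvolution5x5 conv5x5;
    // scale 0: use the sum of the coefficients as the normalisation factor
    conv5x5.configure(input, output, conv, 0, BorderMode::CONSTANT, 0);
    conv5x5.run();
}
```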
+ +/** Basic function to execute non-square convolution. This function calls the following CL kernels: + * + * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE) + * -# @ref CLConvolutionRectangleKernel + * + * @note The convolution rectangle's rows and columns must each be 3, 5, 7 or 9. + */ +class CLConvolutionRectangle : public ICLSimpleFunction +{ +public: + /** Initialize the function's source, destination, conv and border_mode. + * + * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output Destination tensor. Data types supported: U8 or S16. + * @param[in] conv Matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer. + * @param[in] rows Rows of convolution kernel. + * @param[in] cols Columns of convolution kernel. + * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0. + * @param[in] border_mode Strategy to use for borders. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + */ + void configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0); +}; +} +#endif /*__ARM_COMPUTE_CLCONVOLUTION_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h new file mode 100644 index 0000000000..6a40396f9a --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLCONVOLUTIONLAYER_H__ +#define __ARM_COMPUTE_CLCONVOLUTIONLAYER_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h" +#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h" +#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h" +#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLTensor.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Function to reshape and transpose the weights.
This function calls the following kernels: + * -# @ref CLConvolutionLayerWeightsReshapeKernel + * -# @ref CLGEMMTranspose1xWKernel + */ +class CLConvolutionLayerReshapeWeights : public IFunction +{ +public: + /** Constructor */ + CLConvolutionLayerReshapeWeights(); + /** Set the input and output tensors. + * + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: F32. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. + * @param[out] output Destination tensor. Data types supported: Same as @p weights. + * @param[in] transpose1xW True if the weights are to undergo a 1xW transposition after reshaping (in case of GEMM operation), false otherwise. + */ + void configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose1xW); + // Inherited methods overridden: + void run() override; + +private: + CLConvolutionLayerWeightsReshapeKernel _weights_reshape_kernel; + CLGEMMTranspose1xWKernel _weights_transposed_kernel; + CLTensor _weights_reshaped; + bool _transpose1xW; +}; + +/** Basic function to compute the convolution layer. This function calls the following OpenCL kernels: + * + * -# @ref CLConvolutionLayerWeightsReshapeKernel (executed only once for each configuration) + * -# @ref CLGEMMTranspose1xWKernel (executed only once for each configuration) + * -# @ref CLIm2ColKernel + * -# @ref CLGEMMInterleave4x4Kernel + * -# @ref CLGEMMMatrixMultiplyKernel + * -# @ref CLCol2ImKernel + */ +class CLConvolutionLayer : public IFunction +{ +public: + /** Default constructor */ + CLConvolutionLayer(); + /** Set the input and output tensors. + * + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: F16, F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] weights_info Specifies if the weights tensor has been reshaped with @ref CLConvolutionLayerWeightsReshapeKernel. If this is not part of the fully connected layer the weights + * tensor has also been transposed with @ref CLGEMMTranspose1xWKernel. Data type supported: Same as @p input.
+ */
+ void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLConvolutionLayerReshapeWeights _reshape_weights;
+ CLIm2ColKernel _input_im2col_kernel;
+ CLGEMMInterleave4x4Kernel _input_interleave_kernel;
+ CLGEMMMatrixMultiplyKernel _mm_kernel;
+ CLCol2ImKernel _output_col2im_kernel;
+ CLTensor _input_im2col_reshaped;
+ CLTensor _input_interleaved_reshaped;
+ CLTensor _weights_reshaped;
+ CLTensor _weights_transposed;
+ CLTensor _gemm_output;
+ bool _has_bias;
+ bool _is_fully_connected_convolution;
+ bool _are_weights_reshaped;
+};
+}
+#endif /* __ARM_COMPUTE_CLCONVOLUTIONLAYER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLDepthConcatenate.h b/arm_compute/runtime/CL/functions/CLDepthConcatenate.h
new file mode 100644
index 0000000000..3199936b82
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLDepthConcatenate.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLDEPTHCONCATENATE_H__
+#define __ARM_COMPUTE_CLDEPTHCONCATENATE_H__
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+#include <vector>
+
+namespace arm_compute
+{
+class ICLTensor;
+class CLDepthConcatenateKernel;
+class CLFillBorderKernel;
+
+/** Basic function to concatenate tensors along the z axis. This function calls the following kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if input's lowest two dimensions are smaller than respective output's dimensions)
+ * -# @ref CLDepthConcatenateKernel
+ *
+ */
+class CLDepthConcatenate : public IFunction
+{
+public:
+ /** Default constructor */
+ CLDepthConcatenate();
+ /** Initialise the kernel's inputs vector and output.
+ *
+ * @param[in,out] inputs_vector The vector containing all the tensors to concatenate. Data types supported: F32.
+ * @param[out] output Output tensor. Data types supported: F32.
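+ *
+ * A minimal sketch (illustrative only: tensor creation and shapes are assumptions; the inputs must share width/height and the output depth must cover the summed input depths):
+ * @code
+ * // t0, t1 and dst are pre-initialised F32 CLTensors, e.g. 32x32x8, 32x32x8 and 32x32x16.
+ * CLDepthConcatenate concat;
+ * std::vector<ICLTensor *> inputs = { &t0, &t1 };
+ * concat.configure(inputs, &dst);
+ * // ... allocate and fill the tensors, then:
+ * concat.run();
+ * @endcode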
+ */
+ void configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ std::vector<ICLTensor *> _inputs_vector;
+ std::unique_ptr<CLDepthConcatenateKernel[]> _concat_kernels_vector;
+ std::unique_ptr<CLFillBorderKernel[]> _border_handlers_vector;
+ unsigned int _num_inputs;
+};
+}
+#endif /* __ARM_COMPUTE_CLDEPTHCONCATENATE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLDepthConvert.h b/arm_compute/runtime/CL/functions/CLDepthConvert.h
new file mode 100644
index 0000000000..f11027656d
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLDepthConvert.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLDEPTHCONVERT_H__
+#define __ARM_COMPUTE_CLDEPTHCONVERT_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLDepthConvertKernel. */
+class CLDepthConvert : public ICLSimpleFunction
+{
+public:
+ /** Initialize the function's source and destination.
+ *
+ * The input data type must be different from the output data type.
+ *
+ * Valid conversions Input -> Output:
+ *
+ * - U8 -> U16, S16, U32, S32
+ * - U16 -> U8, U32, S32
+ * - S16 -> U8, U32, S32
+ * - U32 -> U8, U16, S16
+ * - S32 -> U8, U16, S16
+ *
+ * @param[in] input The input tensor to convert. Data types supported: U8, U16, S16, U32 or S32.
+ * @param[out] output The output tensor. Data types supported: U8, U16, S16, U32 or S32.
+ * @param[in] policy Conversion policy.
+ * @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift);
+};
+}
+#endif /*__ARM_COMPUTE_CLDEPTHCONVERT_H__*/
diff --git a/arm_compute/runtime/CL/functions/CLDerivative.h b/arm_compute/runtime/CL/functions/CLDerivative.h
new file mode 100644
index 0000000000..05033e8172
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLDerivative.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLDERIVATIVE_H__ +#define __ARM_COMPUTE_CLDERIVATIVE_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +#include + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to execute first order derivative operator. This function calls the following CL kernels: + * + * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE) + * -# @ref CLDerivativeKernel + * + */ +class CLDerivative : public ICLSimpleFunction +{ +public: + /** Initialise the function's source, destinations and border mode. + * + * @note At least one of output_x or output_y must be not NULL. + * + * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output_x (optional) Destination tensor. Derivative along the X direction. Data types supported: S16. + * @param[out] output_y (optional) Destination tensor. Derivative along the Y direction. Data types supported: S16. + * @param[in] border_mode Border mode to use + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + * + */ + void configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0); +}; +} +#endif /* __ARM_COMPUTE_CLDERIVATIVE_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLDilate.h b/arm_compute/runtime/CL/functions/CLDilate.h new file mode 100644 index 0000000000..8534139c86 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLDilate.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLDILATE_H__ +#define __ARM_COMPUTE_CLDILATE_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +#include + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to execute dilate. This function calls the following OpenCL kernels: +* +* -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE) +* -# @ref CLDilateKernel +* +*/ +class CLDilate : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's inputs, output and border mode. + * + * @param[in,out] input First tensor input. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output Output tensor. Data types supported: U8. + * @param[in] border_mode Border mode to use for the convolution. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + */ + void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value); +}; +} +#endif /*__ARM_COMPUTE_CLDILATE_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h b/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h new file mode 100644 index 0000000000..d7182756b5 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLEQUALIZEHISTOGRAM_H__ +#define __ARM_COMPUTE_CLEQUALIZEHISTOGRAM_H__ + +#include "arm_compute/core/CL/kernels/CLHistogramKernel.h" +#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h" +#include "arm_compute/runtime/CL/CLDistribution1D.h" +#include "arm_compute/runtime/CL/CLLut.h" +#include "arm_compute/runtime/IFunction.h" + +#include + +namespace arm_compute +{ +class ICLTensor; +using ICLImage = ICLTensor; + +/** Basic function to execute histogram equalization. 
This function calls the following CL kernels:
+ *
+ * -# @ref CLHistogramKernel
+ * -# @ref CLTableLookupKernel
+ *
+ */
+class CLEqualizeHistogram : public IFunction
+{
+public:
+ /** Default constructor. */
+ CLEqualizeHistogram();
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input image. Data types supported: U8.
+ * @param[out] output Output of same data type with equalized brightness and contrast.
+ */
+ void configure(const ICLImage *input, ICLImage *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLHistogramKernel _histogram_kernel; /**< Kernel that calculates the histogram of input. */
+ CLHistogramBorderKernel _border_histogram_kernel; /**< Kernel that calculates the histogram on the borders. */
+ CLTableLookupKernel _map_histogram_kernel; /**< Kernel that maps the input to output using the LUT. */
+ CLDistribution1D _hist; /**< Distribution that holds the histogram of the input image. */
+ CLDistribution1D _cum_dist; /**< Distribution that holds the cumulative distribution of the input histogram. */
+ CLLut _cd_lut; /**< Holds the equalization lookup table. */
+ static const uint32_t max_range = 256; /**< Histogram range of the internal histograms. */
+ static const uint32_t nr_bins = 256; /**< Histogram bins of the internal histograms. */
+};
+}
+#endif /*__ARM_COMPUTE_CLEQUALIZEHISTOGRAM_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLErode.h b/arm_compute/runtime/CL/functions/CLErode.h
new file mode 100644
index 0000000000..cd2f5516e2
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLErode.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLERODE_H__
+#define __ARM_COMPUTE_CLERODE_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute erode. This function calls the following OpenCL kernels:
+*
+* -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+* -# @ref CLErodeKernel
+*
+*/
+class CLErode : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output and border mode.
+ *
+ * @param[in,out] input First tensor input. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Output tensor. Data types supported: U8.
+ * @param[in] border_mode Border mode to use for the convolution. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + */ + void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value); +}; +} +#endif /*__ARM_COMPUTE_CLERODE_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLFastCorners.h b/arm_compute/runtime/CL/functions/CLFastCorners.h new file mode 100644 index 0000000000..79d82af462 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLFastCorners.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLFASTCORNERS_H__ +#define __ARM_COMPUTE_CLFASTCORNERS_H__ + +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/runtime/CL/CLArray.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h" +#include "arm_compute/runtime/IFunction.h" + +#include + +namespace arm_compute +{ +class ICLTensor; +using ICLImage = ICLTensor; + +/** Basic function to execute fast corners. This function calls the following CL kernels: + * + * -# @ref CLFastCornersKernel + * -# @ref CLNonMaximaSuppression3x3Kernel (executed if nonmax_suppression == true) + * -# @ref CLCopyToArrayKernel + * + */ +class CLFastCorners : public IFunction +{ +public: + /** Constructor */ + CLFastCorners(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLFastCorners(const CLFastCorners &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + const CLFastCorners &operator=(const CLFastCorners &) = delete; + /** Initialize the function's source, destination, conv and border_mode. + * + * @param[in] input Source image. Data types supported: U8. + * @param[in] threshold Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3. + * @param[in] nonmax_suppression If true, non-maximum suppression is applied to detected corners before being placed in the array. + * @param[out] corners Array of keypoints to store the results. 
+ * @param[in,out] num_corners Record number of corners in the array + * @param[in] border_mode Strategy to use for borders. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + */ + void configure(const ICLImage *input, float threshold, bool nonmax_suppression, CLKeyPointArray *corners, unsigned int *num_corners, + BorderMode border_mode, uint8_t constant_border_value = 0); + // Inherited methods overridden: + void run() override; + +private: + CLFastCornersKernel _fast_corners_kernel; + CLNonMaximaSuppression3x3 _suppr_func; + CLCopyToArrayKernel _copy_array_kernel; + CLImage _output; + CLImage _suppr; + Window _win; + bool _non_max; + unsigned int *_num_corners; + cl::Buffer _num_buffer; + CLKeyPointArray *_corners; + uint8_t _constant_border_value; +}; +} +#endif /*__ARM_COMPUTE_CLFASTCORNERS_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLFillBorder.h b/arm_compute/runtime/CL/functions/CLFillBorder.h new file mode 100644 index 0000000000..b4855475c3 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLFillBorder.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLFILLBORDER_H__ +#define __ARM_COMPUTE_CLFILLBORDER_H__ + +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLFillBorderKernel */ +class CLFillBorder : public ICLSimpleFunction +{ +public: + /** Initialize the function + * + * @param[in,out] tensor Source tensor. Data types supported: U8, S16 + * @param[in] border_width The border width + * @param[in] border_mode Strategy to use for borders. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + */ + void configure(ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); +}; +} +#endif /*__ARM_COMPUTE_FILLBORDER_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h new file mode 100644 index 0000000000..826f445bd8 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H__ +#define __ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h" +#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h" +#include "arm_compute/core/CL/kernels/CLTransposeKernel.h" +#include "arm_compute/runtime/CL/CLTensor.h" + +namespace arm_compute +{ +/** Basic function to reshape the weights of Fully Connected layer with OpenCL. This function calls the following kernels: + * + * -# @ref CLTransposeKernel (if @p transpose_weights is set to true) + * -# @ref CLGEMMTranspose1xWKernel (if @p is_batched_fc_layer is set to true) + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + */ +class CLFullyConnectedLayerReshapeWeights : public IFunction +{ +public: + /** Constructor */ + CLFullyConnectedLayerReshapeWeights(); + /** Set the input and output tensors. + * + * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: QS8/F32. + * @param[out] output Destination tensor. Data type supported: Same as @p input. + * @param[in] transpose_weights True if the weights must be transposed. Data types supported: Same as @p weights. + * @param[in] is_batched_fc_layer True if it is a batched fully connected layer + */ + void configure(const ICLTensor *input, ICLTensor *output, bool transpose_weights, bool is_batched_fc_layer); + + // Inherited methods overridden: + void run() override; + +private: + CLTransposeKernel _transpose_kernel; + CLGEMMTranspose1xWKernel _transpose1xW_kernel; + CLTensor _transpose_output; + bool _transpose_weights; + bool _is_batched_fc_layer; +}; + +/** Basic function to compute a Fully Connected layer on OpenCL. 
This function calls the following OpenCL kernels: + * + * -# @ref CLIm2ColKernel (called when the input comes from a convolutional layer) + * -# @ref CLFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false) (called once) + * -# @ref CLGEMMInterleave4x4Kernel (called if we have a multi-batch input) + * -# @ref CLGEMMMatrixMultiplyKernel + * -# @ref CLGEMMMatrixAccumulateBiasesKernel (if @p biases is not equal to nullptr) + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + */ +class CLFullyConnectedLayer : public IFunction +{ +public: + /** Constructor */ + CLFullyConnectedLayer(); + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data type supported: F16/F32. + * @param[in] weights Weights tensor. The weights must be 2 dimensional. Data type supported: Same as @p input + * @param[in] biases Bias tensor. It can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor. Data type supported: Same as @p input. + * @param[in] transpose_weights (Optional) Transpose weights if true. Defaults to true. + * @param[in] are_weights_reshaped (Optional) Reshape the weights tensor if false. Defaults to false. + */ + void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights = true, bool are_weights_reshaped = false); + + //Inherited methods override + void run() override; + +private: + void configure_fc_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output); + void configure_fc_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output); + void configure_conv_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output); + void configure_conv_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output); + + CLIm2ColKernel _im2col_kernel; + CLFullyConnectedLayerReshapeWeights _reshape_weights_kernel; + CLGEMMInterleave4x4Kernel _interleave4x4_kernel; + CLGEMMMatrixMultiplyKernel _mm_kernel; + CLGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; + CLTensor _im2col_output; + CLTensor _interleave4x4_output; + CLTensor _reshape_weights_output; + bool _are_weights_reshaped; + bool _is_fc_after_conv; + bool _is_batched_fc_layer; + bool _accumulate_biases; +}; +} +#endif /* __ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h new file mode 100644 index 0000000000..043b2b8115 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLGEMM.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLGEMM_H__ +#define __ARM_COMPUTE_CLGEMM_H__ + +#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to execute GEMM on OpenCL. Data types supported: F32, F16. This function calls the following OpenCL kernels: + * + * -# @ref CLGEMMInterleave4x4Kernel (if the output tensor is a matrix) + * -# @ref CLGEMMTranspose1xWKernel (if the output tensor is a matrix) + * -# @ref CLGEMMMatrixMultiplyKernel + * -# @ref CLGEMMMatrixAdditionKernel (if c != nullptr and beta != 0.0) + * + */ +class CLGEMM : public IFunction +{ +public: + /** Default constructor. */ + CLGEMM(); + /** Initialise the kernel's inputs and output + * + * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C]. + * + * @note All tensors must have the same data type. Data types supported: F32, F16 + * + * @note Whilst the first input tensor can be a vector, the second input tensor must be at least a matrix + * + * @param[in] a First input tensor (Matrix or Vector A). Data types supported: F32, F16 + * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a. + * @param[in] c Third input tensor (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a. + * @param[out] output Output tensor. Data type supported: same as @p a + * @param[in] alpha Weight of the matrix product + * @param[in] beta Weight of matrix C + */ + void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta); + + // Inherited methods overridden: + void run() override; + +private: + CLGEMMInterleave4x4Kernel _interleave_kernel; + CLGEMMTranspose1xWKernel _transpose_kernel; + CLGEMMMatrixMultiplyKernel _mm_kernel; + CLGEMMMatrixAdditionKernel _ma_kernel; + CLTensor _tmp_a; + CLTensor _tmp_b; + bool _run_vector_matrix_multiplication; + bool _run_addition; +}; +} + +#endif /* __ARM_COMPUTE_CLGEMM_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h b/arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h new file mode 100644 index 0000000000..b80136b328 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLGEMMINTERLEAVE4X4_H__ +#define __ARM_COMPUTE_CLGEMMINTERLEAVE4X4_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to execute CLGEMMInterleave4x4Kernel. This function calls the following OpenCL kernel: + * + * -# @ref CLGEMMInterleave4x4Kernel + * + */ +class CLGEMMInterleave4x4 : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's inputs, output + * + * @param[in] input First input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[out] output Output tensor. Data type supported: same as @p input + */ + void configure(const ICLTensor *input, ICLTensor *output); +}; +} + +#endif /* __ARM_COMPUTE_CLGEMMINTERLEAVE4X4_H__ */ \ No newline at end of file diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowp.h b/arm_compute/runtime/CL/functions/CLGEMMLowp.h new file mode 100644 index 0000000000..da8883c3f8 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLGEMMLowp.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_CLGEMMLOWP_H__ +#define __ARM_COMPUTE_CLGEMMLOWP_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to execute GEMMLowp on OpenCL. This function calls the following OpenCL kernels: +* +* -# @ref CLGEMMInterleave4x4Kernel +* -# @ref CLGEMMTranspose1xWKernel +* -# @ref CLGEMMLowpMatrixMultiplyKernel +* +*/ +class CLGEMMLowp : public IFunction +{ +public: + /** Constructor */ + CLGEMMLowp(); + /** Initialise the kernel's inputs, output + * + * @note GEMM_LOWP: low precision matrix multiply kernel + * This kernel performs the following computation: + * + * -# Convert a values from uint8 to int32 and add a_offset to each of them. + * -# Convert b values from uint8 to int32 and add b_offset to each of them. + * -# Compute the int32 matrix product of the resulting a * b. + * -# Add output_offset to each entry of the result. + * -# Multiply each entry of the result and round to the nearest integer + * -# Clamp the resulting int32 values to the [0..255] range and cast to uint8. + * + * @param[in] a First input tensor (Matrix A). Data types supported: U8. + * @param[in] b Second input tensor (Matrix B). Data types supported: same as @p a. + * @param[out] output Output tensor. Data types supported: same as @p a. + * @param[in] a_offset Offset to be added to each element of the matrix A. + * @param[in] b_offset Offset to be added to each element of the matrix B. + * @param[in] output_offset Offset to be added to each element of the output matrix + * @param[in] output_mult_int Multiplied with each element of the output matrix + * @param[in] shift Number of bits to shift right the result. + */ + void configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift); + + // Inherited methods overridden: + void run() override; + +private: + CLGEMMInterleave4x4Kernel _interleave_kernel; + CLGEMMTranspose1xWKernel _transpose_kernel; + CLGEMMLowpMatrixMultiplyKernel _mm_kernel; + CLTensor _tmp_a; + CLTensor _tmp_b; +}; +} +#endif /*__ARM_COMPUTE_CLGEMMLOWP_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLGaussian3x3.h b/arm_compute/runtime/CL/functions/CLGaussian3x3.h new file mode 100644 index 0000000000..f8223bc5f5 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLGaussian3x3.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLGAUSSIAN3X3_H__ +#define __ARM_COMPUTE_CLGAUSSIAN3X3_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +#include + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to execute gaussian filter 3x3. This function calls the following OpenCL kernels: + * + * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE) + * -# @ref CLGaussian3x3Kernel + * + */ +class CLGaussian3x3 : public ICLSimpleFunction +{ +public: + /** Initialise the function's source, destinations and border mode. + * + * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output Destination tensor, Data types supported: U8. + * @param[in] border_mode Border mode to use for the convolution. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + */ + void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0); +}; +} +#endif /*__ARM_COMPUTE_CLGAUSSIAN3X3_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLGaussian5x5.h b/arm_compute/runtime/CL/functions/CLGaussian5x5.h new file mode 100644 index 0000000000..148b9a9924 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLGaussian5x5.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLGAUSSIAN5X5_H__ +#define __ARM_COMPUTE_CLGAUSSIAN5X5_H__ + +#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" +#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" + +#include + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to execute gaussian filter 5x5. 
This function calls the following OpenCL kernels: + * + * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE) + * -# @ref CLGaussian5x5HorKernel + * -# @ref CLGaussian5x5VertKernel + * + */ +class CLGaussian5x5 : public IFunction +{ +public: + /** Default Constructor. */ + CLGaussian5x5(); + /** Initialise the function's source, destinations and border mode. + * + * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output Destination tensor, Data types supported: U8. + * @param[in] border_mode Border mode to use for the convolution. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + */ + void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0); + + // Inherited methods overridden: + void run() override; + +protected: + CLGaussian5x5HorKernel _kernel_hor; /**< Horizontal pass kernel */ + CLGaussian5x5VertKernel _kernel_vert; /**< Vertical pass kernel */ + CLFillBorderKernel _border_handler; /**< Kernel to handle image borders */ + CLImage _tmp; /**< Temporary buffer */ +}; +} +#endif /*__ARM_COMPUTE_CLGAUSSIAN5X5_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLGaussianPyramid.h b/arm_compute/runtime/CL/functions/CLGaussianPyramid.h new file mode 100644 index 0000000000..97935193dc --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLGaussianPyramid.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_CLGAUSSIANPYRAMID_H__ +#define __ARM_COMPUTE_CLGAUSSIANPYRAMID_H__ + +#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h" + +#include "arm_compute/core/CL/kernels/CLScaleKernel.h" +#include "arm_compute/core/IPyramid.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLPyramid.h" +#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" +#include "arm_compute/runtime/IFunction.h" + +#include +#include + +namespace arm_compute +{ +class ICLTensor; + +/** Common interface for all Gaussian pyramid functions + */ +class CLGaussianPyramid : public IFunction +{ +public: + /** Constructor */ + CLGaussianPyramid(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLGaussianPyramid(const CLGaussianPyramid &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLGaussianPyramid &operator=(const CLGaussianPyramid &) = delete; + /** Allow instances of this class to be moved */ + CLGaussianPyramid(CLGaussianPyramid &&) = default; + /** Allow instances of this class to be moved */ + CLGaussianPyramid &operator=(CLGaussianPyramid &&) = default; + /** Default destructor */ + virtual ~CLGaussianPyramid() = default; + /** Initialise the function's source, destinations and border mode. + * + * @param[in, out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] pyramid Destination pyramid tensors, Data types supported at each level: U8. + * @param[in] border_mode Border mode to use. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + * + */ + virtual void configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value = 0) = 0; + +protected: + ICLTensor *_input; + CLPyramid *_pyramid; + CLPyramid _tmp; +}; + +/** Basic function to execute gaussian pyramid with HALF scale factor. This function calls the following OpenCL kernels: + * + * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE) + * -# @ref CLGaussianPyramidHorKernel + * -# @ref CLGaussianPyramidVertKernel + */ +class CLGaussianPyramidHalf : public CLGaussianPyramid +{ +public: + /** Constructor */ + CLGaussianPyramidHalf(); + + // Inherited methods overridden: + void configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override; + void run() override; + +private: + std::unique_ptr _border_handler; + std::unique_ptr _horizontal_reduction; + std::unique_ptr _vertical_reduction; +}; + +/** Basic function to execute gaussian pyramid with ORB scale factor. 
This function calls the following OpenCL kernels and functions: + * + * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE) + * -# @ref CLGaussian5x5 + * -# @ref CLScaleKernel + */ +class CLGaussianPyramidOrb : public CLGaussianPyramid +{ +public: + /** Constructor */ + CLGaussianPyramidOrb(); + + // Inherited methods overridden: + void configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override; + void run() override; + +private: + std::unique_ptr _gauss5x5; + std::unique_ptr _scale_nearest; +}; +} +#endif /*__ARM_COMPUTE_CLGAUSSIANPYRAMID_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLHOGDescriptor.h b/arm_compute/runtime/CL/functions/CLHOGDescriptor.h new file mode 100644 index 0000000000..cdb23bff33 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLHOGDescriptor.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLHOGDESCRIPTOR_H__ +#define __ARM_COMPUTE_CLHOGDESCRIPTOR_H__ + +#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/functions/CLHOGGradient.h" +#include "arm_compute/runtime/IFunction.h" + +namespace arm_compute +{ +class IHOG; +/** Basic function to calculate HOG descriptor. This function calls the following OpenCL kernels: + * + * -# @ref CLHOGGradient + * -# @ref CLHOGOrientationBinningKernel + * -# @ref CLHOGBlockNormalizationKernel + * + */ +class CLHOGDescriptor : public IFunction +{ +public: + /** Default constructor */ + CLHOGDescriptor(); + /** Initialise the function's source, destination, HOG data-object and border mode + * + * @param[in, out] input Input tensor. Data type supported: U8 + * (Written to only for @p border_mode != UNDEFINED) + * @param[out] output Output tensor which stores the HOG descriptor. DataType supported: F32. The number of channels is equal to the number of histogram bins per block + * @param[in] hog HOG data object which describes the HOG descriptor + * @param[in] border_mode Border mode to use. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. 
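+ *
+ * A minimal sketch (illustrative only: the HOGInfo parameter order and the tensor setup are assumptions, not guaranteed by this header):
+ * @code
+ * // Assumed HOGInfo layout: 8x8 cells, 16x16 blocks, 64x128 detection window, 8x8 block stride, 9 bins.
+ * CLHOG hog;
+ * hog.init(HOGInfo(Size2D(8, 8), Size2D(16, 16), Size2D(64, 128), Size2D(8, 8), 9));
+ * CLHOGDescriptor descriptor;
+ * // input: pre-initialised U8 CLTensor; output: F32 CLTensor sized for the HOG space.
+ * descriptor.configure(&input, &output, &hog, BorderMode::CONSTANT, 0);
+ * // ... allocate and fill the tensors, then:
+ * descriptor.run();
+ * @endcode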
+     */
+    void configure(ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    CLHOGGradient                 _gradient;
+    CLHOGOrientationBinningKernel _orient_bin;
+    CLHOGBlockNormalizationKernel _block_norm;
+    CLTensor                      _mag;
+    CLTensor                      _phase;
+    CLTensor                      _hog_space;
+};
+}
+
+#endif /* __ARM_COMPUTE_CLHOGDESCRIPTOR_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLHOGDetector.h b/arm_compute/runtime/CL/functions/CLHOGDetector.h
new file mode 100644
index 0000000000..0b4fad7766
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLHOGDetector.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHOGDETECTOR_H__
+#define __ARM_COMPUTE_CLHOGDETECTOR_H__
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h"
+#include "arm_compute/core/IHOG.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+/** Basic function to execute HOG detector based on linear SVM. This function calls the following OpenCL kernel:
+ *
+ * -# @ref CLHOGDetectorKernel
+ *
+ */
+class CLHOGDetector : public IFunction
+{
+public:
+    /** Default constructor */
+    CLHOGDetector();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGDetector(const CLHOGDetector &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGDetector &operator=(const CLHOGDetector &) = delete;
+    /** Allow instances of this class to be moved */
+    CLHOGDetector(CLHOGDetector &&) = default;
+    /** Allow instances of this class to be moved */
+    CLHOGDetector &operator=(CLHOGDetector &&) = default;
+    /** Default destructor */
+    ~CLHOGDetector() = default;
+    /** Initialise the kernel's input, output, HOG data object, detection window stride, threshold and class index
+     *
+     * @attention The function does not reset the number of values in @ref IDetectionWindowArray, so it is the caller's responsibility to clear it.
+     *
+     * @param[in]  input                   Input tensor. It is the output of @ref CLHOGDescriptor. Data type supported: F32
+     * @param[in]  hog                     HOG data-object that describes the HOG descriptor
+     * @param[out] detection_windows       Array of @ref DetectionWindow used to store the detected objects
+     * @param[in]  detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
+     *                                     It must be a multiple of the block stride stored in hog
+     * @param[in]  threshold               (Optional) Threshold for the distance between features and SVM classifying plane
+     * @param[in]  idx_class               (Optional) Index of the class used for evaluating which class the detection window belongs to
+     */
+    void configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f, size_t idx_class = 0);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    CLHOGDetectorKernel      _hog_detector_kernel;
+    ICLDetectionWindowArray *_detection_windows;
+    cl::Buffer               _num_detection_windows;
+};
+}
+
+#endif /* __ARM_COMPUTE_CLHOGDETECTOR_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLHOGGradient.h b/arm_compute/runtime/CL/functions/CLHOGGradient.h
new file mode 100644
index 0000000000..e74a68497f
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLHOGGradient.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHOGGRADIENT_H__
+#define __ARM_COMPUTE_CLHOGGRADIENT_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLDerivative.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+/** Basic function to calculate the gradient for HOG. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLDerivative
+ * -# @ref CLMagnitudePhaseKernel
+ *
+ */
+class CLHOGGradient : public IFunction
+{
+public:
+    /** Default constructor */
+    CLHOGGradient();
+    /** Initialise the function's source, destinations, phase type and border mode
+     *
+     * @param[in, out] input                 Input tensor. Data type supported: U8.
+     *                                       (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     output_magnitude      Output tensor (magnitude). Data type supported: U16.
+     * @param[out]     output_phase          Output tensor (phase). Data type supported: U8
+     * @param[in]      phase_type            Type of @ref PhaseType
+     * @param[in]      border_mode           Border mode to use
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    CLDerivative           _derivative;
+    CLMagnitudePhaseKernel _mag_phase;
+    CLTensor               _gx;
+    CLTensor               _gy;
+};
+}
+#endif /*__ARM_COMPUTE_CLHOGGRADIENT_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h b/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
new file mode 100644
index 0000000000..3fe0fa932a
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHOGMULTIDETECTION_H__
+#define __ARM_COMPUTE_CLHOGMULTIDETECTION_H__
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLMultiHOG.h"
+#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
+#include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLHOGDetector.h"
+#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+/** Basic function to detect multiple objects (or the same object at different scales) on the same input image using HOG.
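+ *
+ * A minimal usage sketch is shown below. The image contents, the CLMultiHOG set-up, the array
+ * capacities and the border mode are illustrative assumptions only, not values prescribed by this interface:
+ * @code
+ * CLTensor img;                         // U8 input image tensor, initialised and filled elsewhere
+ * CLMultiHOG multi_hog(2);              // container holding two HOG models, configured elsewhere
+ * CLDetectionWindowArray windows(1000); // detected objects end up here
+ * CLSize2DArray strides(2);             // one detection window stride per HOG model
+ * CLHOGMultiDetection hog_multi;
+ * hog_multi.configure(&img, &multi_hog, &windows, &strides, BorderMode::UNDEFINED);
+ * hog_multi.run();
+ * @endcode
+ *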
+ * This function calls the following kernels:
+ *
+ * -# @ref CLHOGGradient
+ * -# @ref CLHOGOrientationBinningKernel
+ * -# @ref CLHOGBlockNormalizationKernel
+ * -# @ref CLHOGDetector
+ * -# @ref CPPDetectionWindowNonMaximaSuppressionKernel (executed if non_maxima_suppression == true)
+ *
+ * @note This implementation works if all the HOG data-objects within the IMultiHOG container have the same:
+ *       -# Phase type
+ *       -# Normalization type
+ *       -# L2 hysteresis threshold if the normalization type is L2HYS_NORM
+ *
+ */
+class CLHOGMultiDetection : public IFunction
+{
+public:
+    /** Default constructor */
+    CLHOGMultiDetection();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGMultiDetection(const CLHOGMultiDetection &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHOGMultiDetection &operator=(const CLHOGMultiDetection &) = delete;
+    /** Initialise the function's source, destination, detection window strides, border mode, threshold and non-maxima suppression
+     *
+     * @param[in, out] input                    Input tensor. Data type supported: U8
+     *                                          (Written to only for @p border_mode != UNDEFINED)
+     * @param[in]      multi_hog                Container of multiple HOG data-objects. Each HOG data-object describes one HOG model to detect.
+     *                                          This container should store the HOG data-objects in descending or ascending cell_size width order.
+     *                                          This helps to determine whether the HOG descriptor computation can be skipped for some HOG data-objects
+     * @param[out]     detection_windows        Array of @ref DetectionWindow used for locating the detected objects
+     * @param[in]      detection_window_strides Array of @ref Size2D used to specify the distance in pixels between 2 consecutive detection windows in x and y directions for each HOG data-object
+     *                                          The dimension of this array must be the same as multi_hog->num_models()
+     *                                          The i-th detection_window_stride of this array must be a multiple of the block_stride stored in the i-th multi_hog array
+     * @param[in]      border_mode              Border mode to use.
+     * @param[in]      constant_border_value    (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     * @param[in]      threshold                (Optional) Threshold for the distance between features and SVM classifying plane
+     * @param[in]      non_maxima_suppression   (Optional) Flag to specify whether the non-maxima suppression is required or not.
+     *                                          True if the non-maxima suppression stage has to be computed
+     * @param[in]      min_distance             (Optional) Radial Euclidean distance to use for the non-maxima suppression stage
+     *
+     */
+    void configure(ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides, BorderMode border_mode,
+                   uint8_t constant_border_value = 0,
+                   float threshold = 0.0f, bool non_maxima_suppression = false, float min_distance = 1.0f);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    CLHOGGradient                                                 _gradient_kernel;
+    std::unique_ptr<CLHOGOrientationBinningKernel[]>              _orient_bin_kernel;
+    std::unique_ptr<CLHOGBlockNormalizationKernel[]>              _block_norm_kernel;
+    std::unique_ptr<CLHOGDetector[]>                              _hog_detect_kernel;
+    std::unique_ptr<CPPDetectionWindowNonMaximaSuppressionKernel> _non_maxima_kernel;
+    std::unique_ptr<CLTensor[]>                                   _hog_space;
+    std::unique_ptr<CLTensor[]>                                   _hog_norm_space;
+    ICLDetectionWindowArray                                      *_detection_windows;
+    CLTensor                                                      _mag;
+    CLTensor                                                      _phase;
+    bool                                                          _non_maxima_suppression;
+    size_t                                                        _num_orient_bin_kernel;
+    size_t                                                        _num_block_norm_kernel;
+    size_t                                                        _num_hog_detect_kernel;
+};
+}
+
+#endif /* __ARM_COMPUTE_CLHOGMULTIDETECTION_H__ */
\ No newline at end of file
diff --git a/arm_compute/runtime/CL/functions/CLHarrisCorners.h b/arm_compute/runtime/CL/functions/CLHarrisCorners.h
new file mode 100644
index 0000000000..90da687435
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLHarrisCorners.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHARRISCORNERS_H__
+#define __ARM_COMPUTE_CLHARRISCORNERS_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
+#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include <cstdint>
+
+#include <memory>
+
+namespace arm_compute
+{
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Basic function to execute Harris corners detection. This function calls the following CL and NEON kernels and functions:
+ *
+ * @note Requires CPU support for the kernels: CPPCornerCandidatesKernel and CPPSortEuclideanDistanceKernel.
+ *
+ * -# @ref CLSobel3x3 (if gradient_size == 3) or
+ *    @ref CLSobel5x5 (if gradient_size == 5) or
+ *    @ref CLSobel7x7 (if gradient_size == 7)
+ * -# @ref CLFillBorderKernel
+ * -# @ref CLHarrisScoreKernel
+ * -# @ref CLNonMaximaSuppression3x3
+ * -# @ref CPPCornerCandidatesKernel
+ * -# @ref CPPSortEuclideanDistanceKernel
+ */
+class CLHarrisCorners : public IFunction
+{
+public:
+    /** Constructor */
+    CLHarrisCorners();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHarrisCorners(const CLHarrisCorners &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    const CLHarrisCorners &operator=(const CLHarrisCorners &) = delete;
+    /** Initialize the function's source, destination, parameters and border mode.
+     *
+     * @param[in,out] input                 Source image. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[in]     threshold             Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
+     * @param[in]     min_dist              Radial Euclidean distance for the Euclidean distance stage.
+     * @param[in]     sensitivity           Sensitivity threshold k from the Harris-Stephens equation
+     * @param[in]     gradient_size         The gradient window size to use on the input. The implementation supports 3, 5, and 7
+     * @param[in]     block_size            The block window size used to compute the Harris Corner score. The implementation supports 3, 5, and 7.
+     * @param[out]    corners               Array of keypoints to store the results.
+     * @param[in]     border_mode           Border mode to use
+     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ICLImage *input, float threshold, float min_dist, float sensitivity,
+                   int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners,
+                   BorderMode border_mode, uint8_t constant_border_value = 0);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    std::unique_ptr<IFunction>          _sobel;                 /**< Sobel function */
+    CLHarrisScoreKernel                 _harris_score;          /**< Harris score kernel */
+    CLNonMaximaSuppression3x3Kernel     _non_max_suppr;         /**< Non-maxima suppression kernel */
+    CPPCornerCandidatesKernel           _candidates;            /**< Corner candidates kernel */
+    CPPSortEuclideanDistanceKernel      _sort_euclidean;        /**< Euclidean distance kernel */
+    CLFillBorderKernel                  _border_gx;             /**< Border handler before running harris score */
+    CLFillBorderKernel                  _border_gy;             /**< Border handler before running harris score */
+    CLImage                             _gx;                    /**< Source image - Gx component */
+    CLImage                             _gy;                    /**< Source image - Gy component */
+    CLImage                             _score;                 /**< Source image - Harris score */
+    CLImage                             _nonmax;                /**< Source image - Non-Maxima suppressed image */
+    std::unique_ptr<InternalKeypoint[]> _corners_list;          /**< Array of InternalKeypoint. It stores the potential corner candidates */
+    int32_t                             _num_corner_candidates; /**< Number of potential corner candidates */
+    ICLKeyPointArray                   *_corners;               /**< Output corners array */
+};
+}
+#endif /*__ARM_COMPUTE_CLHARRISCORNERS_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLHistogram.h b/arm_compute/runtime/CL/functions/CLHistogram.h
new file mode 100644
index 0000000000..455b61812d
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLHistogram.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHISTOGRAM_H__
+#define __ARM_COMPUTE_CLHISTOGRAM_H__
+
+#include "arm_compute/core/CL/kernels/CLHistogramKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ICLDistribution1D;
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Basic function to execute histogram. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLHistogramKernel
+ * -# @ref CLHistogramBorderKernel
+ *
+ */
+class CLHistogram : public IFunction
+{
+public:
+    /** Default constructor */
+    CLHistogram();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLHistogram(const CLHistogram &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    const CLHistogram &operator=(const CLHistogram &) = delete;
+    /** Initialize the function
+     *
+     * @param[in]  input  Source image. Data types supported: U8
+     * @param[out] output Output distribution.
+     */
+    void configure(const ICLImage *input, ICLDistribution1D *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    CLHistogramKernel       _kernel;        /**< Kernel to run */
+    CLHistogramBorderKernel _kernel_border; /**< Border kernel to run */
+};
+}
+#endif /*__ARM_COMPUTE_CLHISTOGRAM_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLIntegralImage.h b/arm_compute/runtime/CL/functions/CLIntegralImage.h
new file mode 100644
index 0000000000..25fc549b29
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLIntegralImage.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLINTEGRALIMAGE_H__
+#define __ARM_COMPUTE_CLINTEGRALIMAGE_H__
+
+#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute integral image. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLIntegralImageHorKernel
+ * -# @ref CLIntegralImageVertKernel
+ *
+ */
+class CLIntegralImage : public IFunction
+{
+public:
+    /** Default Constructor. */
+    CLIntegralImage();
+    /** Initialise the function's source and destination.
+     *
+     * @param[in]  input  Source tensor. Data types supported: U8.
+     * @param[out] output Destination tensor. Data types supported: U32.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+protected:
+    CLIntegralImageHorKernel  _integral_hor;  /**< Integral Image Horizontal kernel */
+    CLIntegralImageVertKernel _integral_vert; /**< Integral Image Vertical kernel */
+};
+}
+#endif /*__ARM_COMPUTE_CLINTEGRALIMAGE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h b/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h
new file mode 100644
index 0000000000..0c6708aa73
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLLAPLACIANPYRAMID_H__
+#define __ARM_COMPUTE_CLLAPLACIANPYRAMID_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLPyramid.h"
+#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
+#include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
+#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
+#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute Laplacian pyramid. This function calls the following OpenCL kernels and functions:
+ *
+ * -# @ref CLGaussianPyramidHalf
+ * -# @ref CLGaussian5x5
+ * -# @ref CLArithmeticSubtraction
+ *
+ * First a Gaussian pyramid is created. Then, for each level i, the corresponding tensor I(i) is blurred with the Gaussian 5x5 filter, and then the
+ * difference between the two tensors is the corresponding level L(i) of the Laplacian pyramid.
+ * L(i) = I(i) - Gaussian5x5(I(i))
+ * Level 0 always has the same first two dimensions as the input tensor.
+*/
+class CLLaplacianPyramid : public IFunction
+{
+public:
+    /** Constructor */
+    CLLaplacianPyramid();
+    /** Initialise the function's source, destinations and border mode.
+     *
+     * @param[in]  input                 Source tensor. Data types supported: U8.
+     * @param[out] pyramid               Destination pyramid tensors. Data types supported at each level: S16.
+     * @param[out] output                The lowest resolution tensor necessary to reconstruct the input tensor from the pyramid. Data types supported: S16.
+     *                                   The first two dimensions of this tensor must match the first two dimensions of the tensor in the last level of the pyramid, that is:
+     *                                   output.width() = input.width() / pow(2, pyramid_levels - 1) and output.height() = input.height() / pow(2, pyramid_levels - 1)
+     * @param[in]  border_mode           Border mode to use.
+     * @param[in]  constant_border_value Constant value to use for borders if border_mode is set to CONSTANT.
+     *
+     */
+    void configure(ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    size_t                                     _num_levels;
+    CLGaussianPyramidHalf                      _gaussian_pyr_function;
+    std::unique_ptr<CLGaussian5x5[]>           _convf;
+    std::unique_ptr<CLArithmeticSubtraction[]> _subf;
+    CLDepthConvert                             _depth_function;
+    CLPyramid                                  _gauss_pyr;
+    CLPyramid                                  _conv_pyr;
+};
+}
+#endif /*__ARM_COMPUTE_CLLAPLACIANPYRAMID_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h b/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h
new file mode 100644
index 0000000000..4bc7eb65ce
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLLAPLACIANRECONSTRUCT_H__
+#define __ARM_COMPUTE_CLLAPLACIANRECONSTRUCT_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLPyramid.h"
+#include "arm_compute/runtime/CL/functions/CLArithmeticAddition.h"
+#include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
+#include "arm_compute/runtime/CL/functions/CLScale.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Basic function to execute Laplacian reconstruction. This function calls the following OpenCL kernels and functions:
+ *
+ * -# @ref CLArithmeticAddition
+ * -# @ref CLScale
+ * -# @ref CLDepthConvert
+ *
+ * This function reconstructs the original image from a Laplacian Image Pyramid.
+ *
+ * The input image is added to the last level of the Laplacian pyramid L(n-1); the resulting image is upsampled to the
+ * resolution of the next pyramid level.
+ *
+ * I(n-2) = upsample(input + L(n-1))
+ *
+ * For each pyramid level i, except i=0 and i=n-1:
+ *   I(i-1) = upsample(I(i) + L(i))
+ *
+ * output = I(0) + L(0)
+*/
+class CLLaplacianReconstruct : public IFunction
+{
+public:
+    /** Constructor */
+    CLLaplacianReconstruct();
+    /** Initialise the function's source, destinations and border mode.
+     *
+     * The Output image must have the same size as the first level of the pyramid.
+     * The Input image must have the same size as the last level of the pyramid.
+     *
+     * The idea is to reconstruct the original hi-res image from a low-res representation of it and the Laplacian pyramid.
+     *
+     * @param[in]  pyramid               Laplacian pyramid tensors. Data types supported at each level: S16.
+     * @param[in]  input                 Source tensor. Data types supported: S16.
+     * @param[out] output                Output tensor. Data types supported: U8.
+     * @param[in]  border_mode           Border mode to use for the convolution.
+     * @param[in]  constant_border_value Constant value to use for borders if border_mode is set to CONSTANT.
+     *
+     */
+    void configure(const CLPyramid *pyramid, const ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    CLPyramid                               _tmp_pyr;
+    std::unique_ptr<CLArithmeticAddition[]> _addf;
+    std::unique_ptr<CLScale[]>              _scalef;
+    CLDepthConvert                          _depthf;
+};
+}
+#endif /*__ARM_COMPUTE_CLLAPLACIANRECONSTRUCT_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
new file mode 100644
index 0000000000..b4e469196e
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLLOCALLYCONNECTEDLAYER_H__
+#define __ARM_COMPUTE_CLLOCALLYCONNECTEDLAYER_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
+#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
+#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
+#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to compute the locally connected layer. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLLocallyConnectedLayerWeightsReshapeKernel (executed only once for each configuration)
+ * -# @ref CLIm2ColKernel
+ * -# @ref CLLocallyConnectedMatrixMultiplyKernel
+ * -# @ref CLCol2ImKernel
+ */
+class CLLocallyConnectedLayer : public IFunction
+{
+public:
+    /** Default constructor */
+    CLLocallyConnectedLayer();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input     Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+     *                       while every optional dimension from 4 and above represents a batch of inputs.
+     *                       Data types supported: F32.
+     * @param[in]  weights   Weights tensor. Weights are 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches]. Data type supported: Same as @p input.
+     * @param[in]  biases    Biases tensor. Shared biases supported. Biases are 2D tensor with dimensions [OFM, num_patches]. Data type supported: Same as @p input.
+     * @param[out] output    Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+     *                       Data types supported: Same as @p input.
+     * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
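+     *
+     * A minimal configuration sketch. The tensor names, shapes and the allocation steps are
+     * illustrative assumptions only, not something prescribed by this interface:
+     * @code
+     * CLTensor src, weights, biases, dst; // F32 tensors, shaped as described above
+     * CLLocallyConnectedLayer lc_layer;
+     * lc_layer.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 0, 0));
+     * // ... allocate and fill the tensors, then:
+     * lc_layer.run();
+     * @endcode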
+     */
+    void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    CLIm2ColKernel                              _input_im2col_kernel;
+    CLLocallyConnectedLayerWeightsReshapeKernel _weights_reshape_kernel;
+    CLLocallyConnectedMatrixMultiplyKernel      _mm_kernel;
+    CLCol2ImKernel                              _output_col2im_kernel;
+    CLTensor                                    _input_im2col_reshaped;
+    CLTensor                                    _weights_reshaped;
+    CLTensor                                    _gemm_output;
+    bool                                        _is_first_run;
+};
+}
+#endif /* __ARM_COMPUTE_CLLOCALLYCONNECTEDLAYER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLMagnitude.h b/arm_compute/runtime/CL/functions/CLMagnitude.h
new file mode 100644
index 0000000000..dc5f9139b3
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLMagnitude.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLMAGNITUDE_H__
+#define __ARM_COMPUTE_CLMAGNITUDE_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLMagnitudePhaseKernel. */
+class CLMagnitude : public ICLSimpleFunction
+{
+public:
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  input1   First tensor input. Data types supported: S16.
+     * @param[in]  input2   Second tensor input. Data types supported: S16.
+     * @param[out] output   Output tensor. Data types supported: S16.
+     * @param[in]  mag_type (Optional) Magnitude calculation type. Default: L2NORM.
+     */
+    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type = MagnitudeType::L2NORM);
+};
+}
+#endif /*__ARM_COMPUTE_CLMAGNITUDE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLMeanStdDev.h b/arm_compute/runtime/CL/functions/CLMeanStdDev.h
new file mode 100644
index 0000000000..e33bcdd779
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLMeanStdDev.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLMEANSTDDEV_H__
+#define __ARM_COMPUTE_CLMEANSTDDEV_H__
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+/** Basic function to execute mean and standard deviation by calling @ref CLMeanStdDevKernel */
+class CLMeanStdDev : public IFunction
+{
+public:
+    /** Default Constructor. */
+    CLMeanStdDev();
+    /** Initialise the kernel's inputs and outputs.
+     *
+     * @param[in]  input  Input image. Data types supported: U8.
+     * @param[out] mean   Output average pixel value.
+     * @param[out] stddev (Optional) Output standard deviation of pixel values.
+     */
+    void configure(const ICLImage *input, float *mean, float *stddev = nullptr);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    CLMeanStdDevKernel _mean_stddev_kernel; /**< Kernel that computes the mean and the standard deviation. */
+    cl::Buffer         _global_sum;         /**< Variable that holds the global sum among calls in order to ease reduction */
+    cl::Buffer         _global_sum_squared; /**< Variable that holds the global sum of squared values among calls in order to ease reduction */
+};
+}
+#endif /*__ARM_COMPUTE_CLMEANSTDDEV_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLMedian3x3.h b/arm_compute/runtime/CL/functions/CLMedian3x3.h
new file mode 100644
index 0000000000..af84ba7289
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLMedian3x3.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLMEDIAN3X3_H__
+#define __ARM_COMPUTE_CLMEDIAN3X3_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute median filter. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLMedian3x3Kernel
+ *
+ */
+class CLMedian3x3 : public ICLSimpleFunction
+{
+public:
+    /** Initialise the function's source, destination and border mode.
+     *
+     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]    output                Destination tensor. Data types supported: U8.
+     * @param[in]     border_mode           Border mode to use for the convolution.
+     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLMEDIAN3X3_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLMinMaxLocation.h b/arm_compute/runtime/CL/functions/CLMinMaxLocation.h
new file mode 100644
index 0000000000..84fd67515b
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLMinMaxLocation.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLMINMAXLOCATION_H__
+#define __ARM_COMPUTE_CLMINMAXLOCATION_H__
+
+#include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h"
+#include "arm_compute/runtime/CL/CLArray.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Basic function to execute min and max location.
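+ *
+ * A minimal usage sketch (the image set-up and the array capacities are illustrative assumptions):
+ * @code
+ * CLTensor img;                                  // U8 input image tensor, initialised and filled elsewhere
+ * int32_t min_val = 0, max_val = 0;
+ * CLCoordinates2DArray min_loc(32), max_loc(32); // capacity caps the number of reported locations
+ * CLMinMaxLocation mml;
+ * mml.configure(&img, &min_val, &max_val, &min_loc, &max_loc);
+ * mml.run();
+ * @endcode
+ *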
+ * This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLMinMaxKernel
+ * -# @ref CLMinMaxLocationKernel
+ */
+class CLMinMaxLocation : public IFunction
+{
+public:
+    /** Constructor */
+    CLMinMaxLocation();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMinMaxLocation(const CLMinMaxLocation &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMinMaxLocation &operator=(const CLMinMaxLocation &) = delete;
+    /** Allow instances of this class to be moved */
+    CLMinMaxLocation(CLMinMaxLocation &&) = default;
+    /** Allow instances of this class to be moved */
+    CLMinMaxLocation &operator=(CLMinMaxLocation &&) = default;
+    /** Initialise the kernel's inputs and outputs.
+     *
+     * @note When locations of min and max occurrences are requested, the reported number of locations is limited to the given array size.
+     *
+     * @param[in]  input     Input image. Data types supported: U8 or S16.
+     * @param[out] min       Minimum value of image.
+     * @param[out] max       Maximum value of image.
+     * @param[out] min_loc   (Optional) Array of Coordinates2D used to store minimum value locations.
+     * @param[out] max_loc   (Optional) Array of Coordinates2D used to store maximum value locations.
+     * @param[out] min_count (Optional) Number of minimum value encounters.
+     * @param[out] max_count (Optional) Number of maximum value encounters.
+     */
+    void configure(const ICLImage *input, int32_t *min, int32_t *max,
+                   CLCoordinates2DArray *min_loc = nullptr, CLCoordinates2DArray *max_loc = nullptr,
+                   uint32_t *min_count = nullptr, uint32_t *max_count = nullptr);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    CLMinMaxKernel         _min_max_kernel;     /**< Kernel that performs min/max */
+    CLMinMaxLocationKernel _min_max_loc_kernel; /**< Kernel that counts min/max occurrences and identifies their positions */
+    cl::Buffer             _min_max_vals;       /**< Buffer to collect min, max values */
+    cl::Buffer             _min_max_count_vals; /**< Buffer to collect the number of min, max occurrences */
+    int32_t               *_min;                /**< Minimum value. */
+    int32_t               *_max;                /**< Maximum value. */
+    uint32_t              *_min_count;          /**< Minimum value occurrences. */
+    uint32_t              *_max_count;          /**< Maximum value occurrences. */
+    CLCoordinates2DArray  *_min_loc;            /**< Minimum value occurrences coordinates. */
+    CLCoordinates2DArray  *_max_loc;            /**< Maximum value occurrences coordinates. */
+};
+}
+#endif /*__ARM_COMPUTE_CLMINMAXLOCATION_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLNonLinearFilter.h b/arm_compute/runtime/CL/functions/CLNonLinearFilter.h
new file mode 100644
index 0000000000..9eee33e0ba
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLNonLinearFilter.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLNONLINEARFILTER_H__
+#define __ARM_COMPUTE_CLNONLINEARFILTER_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute non linear filter. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLNonLinearFilterKernel
+ *
+ * @note Supported mask dimensions: squares of sizes 3 and 5
+ */
+class CLNonLinearFilter : public ICLSimpleFunction
+{
+public:
+    /** Initialize the function's source, destination, filter function, mask and border mode.
+     *
+     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]    output                Destination tensor. Data types supported: U8
+     * @param[in]     function              Non linear function to perform
+     * @param[in]     mask_size             Mask size. Supported sizes: 3, 5
+     * @param[in]     pattern               Mask pattern
+     * @param[in]     mask                  The given mask. Used only if pattern is set to PATTERN_OTHER
+     * @param[in]     border_mode           Strategy to use for borders.
+     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
+                   BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLNONLINEARFILTER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h b/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h
new file mode 100644
index 0000000000..7adced4313
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLNONMAXIMASUPPRESSION3X3_H__
+#define __ARM_COMPUTE_CLNONMAXIMASUPPRESSION3X3_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute non-maxima suppression over a 3x3 window. This function calls the following CL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLNonMaximaSuppression3x3Kernel
+ */
+class CLNonMaximaSuppression3x3 : public ICLSimpleFunction
+{
+public:
+    /** Initialise the function's source, destination and border mode.
+     *
+     * @note The implementation supports just 2 border modes: UNDEFINED and CONSTANT
+     *       The constant value used with CONSTANT border mode is 0
+     *
+     * @param[in,out] input       Source tensor. Data types supported: U8, F32. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]    output      Destination tensor for the non-maxima suppression 3x3. Data types supported: same as @p input.
+     * @param[in]     border_mode Border mode to use for non-maxima suppression.
+     *                            The implementation supports just 2 border modes: UNDEFINED and CONSTANT
+     */
+    void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode);
+};
+}
+#endif /* __ARM_COMPUTE_CLNONMAXIMASUPPRESSION3X3_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
new file mode 100644
index 0000000000..a4dae85c1d
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLNORMALIZATIONLAYER_H__
+#define __ARM_COMPUTE_CLNORMALIZATIONLAYER_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to simulate a normalization layer.
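+ *
+ * A minimal usage sketch (the tensor shape, data type and the chosen NormType are illustrative assumptions):
+ * @code
+ * CLTensor src, dst; // e.g. F32 tensors of shape [width, height, IFM], initialised and allocated elsewhere
+ * CLNormalizationLayer norm;
+ * norm.configure(&src, &dst, NormalizationLayerInfo(NormType::CROSS_MAP, 5));
+ * norm.run();
+ * @endcode
+ *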
+ * This function calls the following CL kernels:
+ *
+ * -# @ref CLPixelWiseMultiplicationKernel
+ * -# @ref CLFillBorderKernel
+ * -# @ref CLNormalizationLayerKernel
+ *
+ */
+class CLNormalizationLayer : public IFunction
+{
+public:
+    /** Default constructor */
+    CLNormalizationLayer();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input     Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
+     *                       and an optional 4th dimension for batch of inputs. Data types supported: F16, F32. Number of channels must be 1.
+     * @param[out] output    Destination tensor. Dimensions, data type and number of channels must match the input ones.
+     * @param[in]  norm_info Normalization layer information like the normalization type, normalization size and other parameters.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    CLTensor                        _squared_input;   /**< The intermediate buffer which stores results of squaring input */
+    CLNormalizationLayerKernel      _norm_kernel;     /**< Normalization layer kernel to run */
+    CLPixelWiseMultiplicationKernel _multiply_kernel; /**< Pixel multiplication kernel to run */
+    CLFillBorderKernel              _border_handler;  /**< Kernel to handle borders */
+};
+}
+#endif /* __ARM_COMPUTE_CLNORMALIZATIONLAYER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLOpticalFlow.h b/arm_compute/runtime/CL/functions/CLOpticalFlow.h
new file mode 100644
index 0000000000..ca3f86100e
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLOpticalFlow.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLOPTICALFLOW_H__
+#define __ARM_COMPUTE_CLOPTICALFLOW_H__
+
+#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h"
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLArray.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+class CLPyramid;
+
+using CLLKInternalKeypointArray = CLArray<CLLKInternalKeypoint>;
+using CLCoefficientTableArray   = CLArray<CLCoefficientTable>;
+using CLOldValueArray           = CLArray<CLOldValue>;
+
+/** Basic function to execute optical flow.
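+ *
+ * A minimal usage sketch (the pyramids and keypoint arrays are assumed to be built and filled
+ * elsewhere; the termination settings shown are illustrative):
+ * @code
+ * CLPyramid old_pyr, new_pyr; // U8 pyramids of the previous and the current frame
+ * CLKeyPointArray old_pts(100), estimates(100), new_pts(100);
+ * CLOpticalFlow optflow;
+ * optflow.configure(&old_pyr, &new_pyr, &old_pts, &estimates, &new_pts,
+ *                   Termination::TERM_CRITERIA_BOTH, 0.01f, 10, 21, true, BorderMode::UNDEFINED);
+ * optflow.run();
+ * @endcode
+ *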
This function calls the following OpenCL kernels and functions: + * + * -# @ref CLScharr3x3 + * -# @ref CLLKTrackerInitKernel + * -# @ref CLLKTrackerStage0Kernel + * -# @ref CLLKTrackerStage1Kernel + * -# @ref CLLKTrackerFinalizeKernel + */ +class CLOpticalFlow : public IFunction +{ +public: + /** Default constructor */ + CLOpticalFlow(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLOpticalFlow(const CLOpticalFlow &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLOpticalFlow &operator=(const CLOpticalFlow &) = delete; + /** Allow instances of this class to be moved */ + CLOpticalFlow(CLOpticalFlow &&) = default; + /** Allow instances of this class to be moved */ + CLOpticalFlow &operator=(CLOpticalFlow &&) = default; + /** Initialise the function input and output + * + * @param[in] old_pyramid Pointer to the pyramid for the old tensor. Data types supported U8 + * @param[in] new_pyramid Pointer to the pyramid for the new tensor. Data types supported U8 + * @param[in] old_points Pointer to the IKeyPointArray storing old key points + * @param[in] new_points_estimates Pointer to the IKeyPointArray storing new estimates key points + * @param[out] new_points Pointer to the IKeyPointArray storing new key points + * @param[in] termination The criteria to terminate the search of each keypoint. + * @param[in] epsilon The error for terminating the algorithm + * @param[in] num_iterations The maximum number of iterations before terminate the alogrithm + * @param[in] window_dimension The size of the window on which to perform the algorithm + * @param[in] use_initial_estimate The flag to indicate whether the initial estimated position should be used + * @param[in] border_mode The border mode applied at scharr kernel stage + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT + * + */ + void configure(const CLPyramid *old_pyramid, const CLPyramid *new_pyramid, + const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points, + Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate, + BorderMode border_mode, uint8_t constant_border_value = 0); + + // Inherited methods overridden: + void run() override; + +private: + std::unique_ptr _tracker_init_kernel; + std::unique_ptr _tracker_stage0_kernel; + std::unique_ptr _tracker_stage1_kernel; + CLLKTrackerFinalizeKernel _tracker_finalize_kernel; + std::unique_ptr _func_scharr; + std::unique_ptr _scharr_gx; + std::unique_ptr _scharr_gy; + const ICLKeyPointArray *_old_points; + const ICLKeyPointArray *_new_points_estimates; + ICLKeyPointArray *_new_points; + std::unique_ptr _old_points_internal; + std::unique_ptr _new_points_internal; + std::unique_ptr _coefficient_table; + std::unique_ptr _old_values; + size_t _num_levels; +}; +} +#endif /*__ARM_COMPUTE_CLOPTICALFLOW_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLPhase.h b/arm_compute/runtime/CL/functions/CLPhase.h new file mode 100644 index 0000000000..7cdfab16e2 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLPhase.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLPHASE_H__ +#define __ARM_COMPUTE_CLPHASE_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to execute an @ref CLMagnitudePhaseKernel. */ +class CLPhase : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's inputs, output. + * + * @param[in] input1 First tensor input. Data types supported: S16. + * @param[in] input2 Second tensor input. Data types supported: S16. + * @param[out] output Output tensor. Data types supported: U8. + * @param[in] phase_type (Optional) Phase calculation type. Default: SIGNED. + */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type = PhaseType::SIGNED); +}; +} +#endif /*__ARM_COMPUTE_CLPHASE_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h new file mode 100644 index 0000000000..71754fc3f4 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+     */
+    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
+                   ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+};
+}
+#endif /*__ARM_COMPUTE_CLPIXELWISEMULTIPLICATION_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLPoolingLayer.h b/arm_compute/runtime/CL/functions/CLPoolingLayer.h
new file mode 100644
index 0000000000..f92860e5b2
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLPoolingLayer.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLPOOLINGLAYER_H__
+#define __ARM_COMPUTE_CLPOOLINGLAYER_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
+ * -# @ref CLPoolingLayerKernel
+ */
+class CLPoolingLayer : public ICLSimpleFunction
+{
+public:
+    /** Set the input and output tensors.
+     *
+     * @param[in,out] input     Source tensor. (Written to only when padding != 0) Data types supported: F16, F32.
+     * @param[out]    output    Destination tensor. Data types supported: Same as @p input.
+     * @param[in]     pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
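+     *
+     * A minimal usage sketch (tensor setup elided; pooling parameters are illustrative):
+     * @code
+     * CLPoolingLayer pool;
+     * // 2x2 max pooling with stride 2 and no padding
+     * pool.configure(&src, &dst, PoolingLayerInfo(PoolingType::MAX, 2, PadStrideInfo(2, 2, 0, 0)));
+     * pool.run();
+     * @endcode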
+     */
+    void configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info);
+};
+}
+#endif /* __ARM_COMPUTE_CLPOOLINGLAYER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLRemap.h b/arm_compute/runtime/CL/functions/CLRemap.h
new file mode 100644
index 0000000000..4cb2be90e7
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLRemap.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLREMAP_H__
+#define __ARM_COMPUTE_CLREMAP_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute remap. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLRemapKernel
+ */
+class CLRemap : public ICLSimpleFunction
+{
+public:
+    /** Initialise the function's sources, destination, interpolation policy and border mode.
+     *
+     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[in]     map_x                 Map for X coords. Data types supported: F32.
+     * @param[in]     map_y                 Map for Y coords. Data types supported: F32.
+     * @param[out]    output                Output tensor. Data types supported: U8.
+     * @param[in]     policy                Interpolation policy to use. Only NEAREST and BILINEAR are supported.
+     * @param[in]     border_mode           Border mode to use on the input tensor.
+     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
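+     *
+     * A minimal usage sketch (tensor setup elided; map_x and map_y hold, for every
+     * output pixel, the source coordinates to sample from):
+     * @code
+     * CLRemap remap;
+     * remap.configure(&src, &map_x, &map_y, &dst, InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::CONSTANT, 0);
+     * remap.run();
+     * @endcode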
+     *
+     */
+    void configure(ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output,
+                   InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLREMAP_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLScale.h b/arm_compute/runtime/CL/functions/CLScale.h
new file mode 100644
index 0000000000..c2438ddf9b
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLScale.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSCALE_H__
+#define __ARM_COMPUTE_CLSCALE_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLScaleKernel */
+class CLScale : public ICLSimpleFunction
+{
+public:
+    /** Initialize the function's source, destination, interpolation type and border_mode.
+     *
+     * @param[in,out] input                 Source tensor. Data types supported: U8, S16. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]    output                Destination tensor. Data types supported: U8, S16 (Must be the same as the input tensor).
+     *                                      All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in]     policy                The interpolation type.
+     * @param[in]     border_mode           Strategy to use for borders.
+     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
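+     *
+     * A minimal usage sketch (tensor setup elided; the destination shape, e.g.
+     * half the source size, determines the scaling factor):
+     * @code
+     * CLScale scale;
+     * scale.configure(&src, &dst, InterpolationPolicy::BILINEAR, BorderMode::REPLICATE);
+     * scale.run();
+     * @endcode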
+     */
+    void configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLSCALE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLScharr3x3.h b/arm_compute/runtime/CL/functions/CLScharr3x3.h
new file mode 100644
index 0000000000..3ea0b84624
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLScharr3x3.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSCHARR3X3_H__
+#define __ARM_COMPUTE_CLSCHARR3X3_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute a Scharr 3x3 filter. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLScharr3x3Kernel
+ *
+ */
+class CLScharr3x3 : public ICLSimpleFunction
+{
+public:
+    /** Initialise the function's source, destinations and border mode.
+     *
+     * @note At least one of output_x or output_y must not be NULL.
+     *
+     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]    output_x              (optional) Destination for the Scharr 3x3 convolution along the X axis. Data types supported: S16.
+     * @param[out]    output_y              (optional) Destination for the Scharr 3x3 convolution along the Y axis. Data types supported: S16.
+     * @param[in]     border_mode           Border mode to use for the convolution.
+     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLSCHARR3X3_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLSobel3x3.h b/arm_compute/runtime/CL/functions/CLSobel3x3.h
new file mode 100644
index 0000000000..7a4f47d0ed
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLSobel3x3.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSOBEL3X3_H__
+#define __ARM_COMPUTE_CLSOBEL3X3_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute a Sobel 3x3 filter. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLSobel3x3Kernel
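+ *
+ * A minimal usage sketch (tensor setup elided; either gradient output may be
+ * nullptr if it is not needed):
+ * @code
+ * CLSobel3x3 sobel;
+ * sobel.configure(&src, &grad_x, &grad_y, BorderMode::REPLICATE);
+ * sobel.run();
+ * @endcode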
+ *
+ */
+class CLSobel3x3 : public ICLSimpleFunction
+{
+public:
+    /** Initialise the function's source, destinations and border mode.
+     *
+     * @note At least one of output_x or output_y must not be NULL.
+     *
+     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]    output_x              (optional) Destination for the Sobel 3x3 convolution along the X axis. Data types supported: S16.
+     * @param[out]    output_y              (optional) Destination for the Sobel 3x3 convolution along the Y axis. Data types supported: S16.
+     * @param[in]     border_mode           Border mode to use for the convolution.
+     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLSOBEL3X3_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLSobel5x5.h b/arm_compute/runtime/CL/functions/CLSobel5x5.h
new file mode 100644
index 0000000000..ad1f72faf8
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLSobel5x5.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSOBEL5X5_H__
+#define __ARM_COMPUTE_CLSOBEL5X5_H__
+
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute a Sobel 5x5 filter. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLSobel5x5HorKernel
+ * -# @ref CLSobel5x5VertKernel
+ *
+ */
+class CLSobel5x5 : public IFunction
+{
+public:
+    /** Default Constructor. */
+    CLSobel5x5();
+    /** Initialise the function's source, destinations and border mode.
+     *
+     * @note At least one of output_x or output_y must not be NULL.
+     *
+     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]    output_x              (optional) Destination for the Sobel 5x5 convolution along the X axis. Data types supported: S16.
+     * @param[out]    output_y              (optional) Destination for the Sobel 5x5 convolution along the Y axis. Data types supported: S16.
+     * @param[in]     border_mode           Border mode to use for the convolution.
+     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+    // Inherited methods overridden:
+    void run() override;
+
+protected:
+    CLSobel5x5HorKernel  _sobel_hor;      /**< Sobel Horizontal 5x5 kernel */
+    CLSobel5x5VertKernel _sobel_vert;     /**< Sobel Vertical 5x5 kernel */
+    CLFillBorderKernel   _border_handler; /**< Kernel to handle image borders */
+    CLImage              _tmp_x;          /**< Temporary buffer for Sobel X */
+    CLImage              _tmp_y;          /**< Temporary buffer for Sobel Y */
+};
+}
+#endif /*__ARM_COMPUTE_CLSOBEL5X5_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLSobel7x7.h b/arm_compute/runtime/CL/functions/CLSobel7x7.h
new file mode 100644
index 0000000000..1a3fe1a50a
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLSobel7x7.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSOBEL7X7_H__
+#define __ARM_COMPUTE_CLSOBEL7X7_H__
+
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute a Sobel 7x7 filter. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLSobel7x7HorKernel
+ * -# @ref CLSobel7x7VertKernel
+ *
+ */
+class CLSobel7x7 : public IFunction
+{
+public:
+    /** Default Constructor. */
+    CLSobel7x7();
+    /** Initialise the function's source, destinations and border mode.
+     *
+     * @note At least one of output_x or output_y must not be NULL.
+     *
+     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]    output_x              (optional) Destination for the Sobel 7x7 convolution along the X axis. Data types supported: S32.
+     * @param[out]    output_y              (optional) Destination for the Sobel 7x7 convolution along the Y axis. Data types supported: S32.
+     * @param[in]     border_mode           Border mode to use for the convolution.
+     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+    // Inherited methods overridden:
+    void run() override;
+
+protected:
+    CLSobel7x7HorKernel  _sobel_hor;      /**< Sobel Horizontal 7x7 kernel */
+    CLSobel7x7VertKernel _sobel_vert;     /**< Sobel Vertical 7x7 kernel */
+    CLFillBorderKernel   _border_handler; /**< Kernel to handle image borders */
+    CLImage              _tmp_x;          /**< Temporary buffer for Sobel X */
+    CLImage              _tmp_y;          /**< Temporary buffer for Sobel Y */
+};
+}
+#endif /*__ARM_COMPUTE_CLSOBEL7X7_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
new file mode 100644
index 0000000000..42cfc06fc4
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSOFTMAXLAYER_H__
+#define __ARM_COMPUTE_CLSOFTMAXLAYER_H__
+
+#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to compute a SoftmaxLayer.
+ *
+ * Softmax is calculated by:
+ * @f[ out = exp(x - max(x)) / sum(exp(x - max(x))) @f]
+ *
+ * This function runs the following kernels:
+ * -# @ref CLLogits1DMaxKernel
+ * -# @ref CLLogits1DShiftExpSumKernel
+ * -# @ref CLLogits1DNormKernel
+ */
+class CLSoftmaxLayer : public IFunction
+{
+public:
+    /** Constructor */
+    CLSoftmaxLayer();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input  Source tensor. Data types supported: F16, F32. Number of channels must be 1.
+     * @param[out] output Destination tensor. Matching input type and channel number.
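+     *
+     * A minimal usage sketch (tensor allocation elided; the shape is illustrative):
+     * @code
+     * CLTensor       logits, probs; // e.g. 1D tensors of 1000 F32 values
+     * CLSoftmaxLayer softmax;
+     * softmax.configure(&logits, &probs);
+     * softmax.run();
+     * @endcode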
+ */ + void configure(const ICLTensor *input, ICLTensor *output); + + // Inherited methods overridden: + void run() override; + +private: + CLLogits1DMaxKernel _max_kernel; + CLLogits1DShiftExpSumKernel _shift_exp_sum_kernel; + CLLogits1DNormKernel _norm_kernel; + CLTensor _max; + CLTensor _sum; + CLTensor _tmp; +}; +} +#endif /* __ARM_COMPUTE_CLSOFTMAXLAYER_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLTableLookup.h b/arm_compute/runtime/CL/functions/CLTableLookup.h new file mode 100644 index 0000000000..ebe6593b6a --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLTableLookup.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLTABLELOOKUP_H__ +#define __ARM_COMPUTE_CLTABLELOOKUP_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; +class ICLLut; + +/** Basic function to run @ref CLTableLookupKernel */ +class CLTableLookup : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's inputs and output + * + * @param[in] input First tensor input. Data types supported: U8 and S16 + * @param[in] lut Input lookup table. Data types supported: U8 and S16 + * @param[out] output Output tensor. Data types supported: U8 and S16 + */ + void configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output); +}; +} +#endif /*__ARM_COMPUTE_CLTABLELOOKUP_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLThreshold.h b/arm_compute/runtime/CL/functions/CLThreshold.h new file mode 100644 index 0000000000..14c05786c1 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLThreshold.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLTHRESHOLD_H__
+#define __ARM_COMPUTE_CLTHRESHOLD_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLThresholdKernel */
+class CLThreshold : public ICLSimpleFunction
+{
+public:
+    /** Initialise the function's source, destination, thresholds and threshold type
+     *
+     * @param[in]  input       First tensor input. Data types supported: U8.
+     * @param[out] output      Output tensor. Data types supported: U8.
+     * @param[in]  threshold   Threshold. If upper threshold is specified, this will be used as the lower threshold.
+     * @param[in]  false_value Value to assign when the condition is false.
+     * @param[in]  true_value  Value to assign when the condition is true.
+     * @param[in]  type        Thresholding type. Can either be BINARY or RANGE.
+     * @param[in]  upper       Upper threshold. Only used with RANGE thresholding.
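+     *
+     * A minimal usage sketch (tensor setup elided; values are illustrative):
+     * @code
+     * CLThreshold thresh;
+     * // Binary thresholding: pixels above 127 become 255, all others 0
+     * thresh.configure(&src, &dst, 127, 0, 255, ThresholdType::BINARY);
+     * thresh.run();
+     * @endcode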
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold,
+                   uint8_t false_value = 0, uint8_t true_value = 0,
+                   ThresholdType type = ThresholdType::BINARY, uint8_t upper = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLTHRESHOLD_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLTranspose.h b/arm_compute/runtime/CL/functions/CLTranspose.h
new file mode 100644
index 0000000000..9b57fe00a8
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLTranspose.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLTRANSPOSE_H__
+#define __ARM_COMPUTE_CLTRANSPOSE_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to transpose a matrix on OpenCL. This function calls the following OpenCL kernel:
+ *
+ * -# @ref CLTransposeKernel
+ *
+ */
+class CLTranspose : public ICLSimpleFunction
+{
+public:
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]  input  Input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32
+     * @param[out] output Output tensor. Data type supported: Same as @p input
+     */
+    void configure(const ICLTensor *input, ICLTensor *output);
+};
+}
+
+#endif /* __ARM_COMPUTE_CLTRANSPOSE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLWarpAffine.h b/arm_compute/runtime/CL/functions/CLWarpAffine.h
new file mode 100644
index 0000000000..aeab3f7b22
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLWarpAffine.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLWARPAFFINE_H__
+#define __ARM_COMPUTE_CLWARPAFFINE_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLWarpAffineKernel for AFFINE transformation */
+class CLWarpAffine : public ICLSimpleFunction
+{
+public:
+    /** Initialize the function's source, destination, interpolation policy and border_mode.
+     *
+     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]    output                Destination tensor. Data types supported: U8.
+     * @param[in]     matrix                The affine matrix. Must be 2x3 of type float.
+     * @param[in]     policy                The interpolation type.
+     * @param[in]     border_mode           Strategy to use for borders.
+     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
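+     *
+     * A minimal usage sketch (tensor setup elided; the matrix values are illustrative,
+     * here an identity transform plus a translation of (10, 20)):
+     * @code
+     * const float matrix[] = { 1.f, 0.f, 0.f, 1.f, 10.f, 20.f };
+     * CLWarpAffine warp;
+     * warp.configure(&src, &dst, matrix, InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::CONSTANT, 0);
+     * warp.run();
+     * @endcode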
+     */
+    void configure(ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLWARPAFFINE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLWarpPerspective.h b/arm_compute/runtime/CL/functions/CLWarpPerspective.h
new file mode 100644
index 0000000000..80237017aa
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLWarpPerspective.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLWARPPERSPECTIVE_H__
+#define __ARM_COMPUTE_CLWARPPERSPECTIVE_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLWarpPerspectiveKernel for PERSPECTIVE transformation */
+class CLWarpPerspective : public ICLSimpleFunction
+{
+public:
+    /** Initialize the function's source, destination, interpolation policy and border_mode.
+     *
+     * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]    output                Destination tensor. Data types supported: U8.
+     * @param[in]     matrix                The perspective matrix. Must be 3x3 of type float.
+     * @param[in]     policy                The interpolation type.
+     * @param[in]     border_mode           Strategy to use for borders.
+     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLWARPPERSPECTIVE_H__ */
diff --git a/arm_compute/runtime/CPP/CPPScheduler.h b/arm_compute/runtime/CPP/CPPScheduler.h
new file mode 100644
index 0000000000..7a37e5ec21
--- /dev/null
+++ b/arm_compute/runtime/CPP/CPPScheduler.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CPPSCHEDULER_H__
+#define __ARM_COMPUTE_CPPSCHEDULER_H__
+
+#include "arm_compute/runtime/IScheduler.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class Thread;
+
+/** C++11 implementation of a pool of threads to automatically split a kernel's execution among several threads. */
+class CPPScheduler : public IScheduler
+{
+public:
+    /** Sets the number of threads the scheduler will use to run the kernels.
+     *
+     * @param[in] num_threads If set to 0, then the maximum number of threads supported by C++11 will be used, otherwise the number of threads specified.
+     */
+    void set_num_threads(unsigned int num_threads) override;
+    /** Returns the number of threads that the CPPScheduler has in its pool.
+     *
+     * @return Number of threads available in CPPScheduler.
+     */
+    unsigned int num_threads() const override;
+    /** Access the scheduler singleton
+     *
+     * @return The scheduler
+     */
+    static CPPScheduler &get();
+    /** Multithread the execution of the passed kernel if possible.
+     *
+     * The kernel will run on a single thread if any of these conditions is true:
+     * - ICPPKernel::is_parallelisable() returns false
+     * - The scheduler has been initialized with only one thread.
+     *
+     * @param[in] kernel          Kernel to execute.
+     * @param[in] split_dimension Dimension along which to split the kernel's execution window.
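+     *
+     * A rough usage sketch (kernel setup elided; my_kernel is illustrative):
+     * @code
+     * CPPScheduler::get().set_num_threads(4);                 // optional, 0 means "use the maximum"
+     * CPPScheduler::get().schedule(&my_kernel, Window::DimY); // split the execution window along Y
+     * @endcode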
+     */
+    void schedule(ICPPKernel *kernel, unsigned int split_dimension) override;
+
+private:
+    /** Constructor: create a pool of threads. */
+    CPPScheduler();
+
+    unsigned int              _num_threads;
+    std::unique_ptr<Thread[]> _threads;
+};
+}
+#endif /* __ARM_COMPUTE_CPPSCHEDULER_H__ */
diff --git a/arm_compute/runtime/Distribution1D.h b/arm_compute/runtime/Distribution1D.h
new file mode 100644
index 0000000000..7080e88075
--- /dev/null
+++ b/arm_compute/runtime/Distribution1D.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_DISTRIBUTION1D_H__
+#define __ARM_COMPUTE_DISTRIBUTION1D_H__
+
+#include "arm_compute/core/IDistribution1D.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+/** Basic implementation of the 1D distribution interface */
+class Distribution1D : public IDistribution1D
+{
+public:
+    /** Constructor: Creates a 1D Distribution of a consecutive interval [offset, offset + range - 1]
+     * defined by a start offset and valid range, divided equally into num_bins parts.
+     *
+     * @param[in] num_bins The number of bins the distribution is divided into.
+     * @param[in] offset   The start of the values to use.
+     * @param[in] range    The total number of the consecutive values of the distribution interval.
+     */
+    Distribution1D(size_t num_bins, int32_t offset, uint32_t range);
+
+    // Inherited methods overridden:
+    uint32_t *buffer() const override;
+
+private:
+    std::unique_ptr<uint32_t[]> _data; /**< The distribution data. */
+};
+}
+#endif /* __ARM_COMPUTE_DISTRIBUTION1D_H__ */
diff --git a/arm_compute/runtime/HOG.h b/arm_compute/runtime/HOG.h
new file mode 100644
index 0000000000..70d8034bef
--- /dev/null
+++ b/arm_compute/runtime/HOG.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_HOG_H__
+#define __ARM_COMPUTE_HOG_H__
+
+#include "arm_compute/core/HOGInfo.h"
+#include "arm_compute/core/IHOG.h"
+#include "arm_compute/core/Types.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** CPU implementation of HOG data-object */
+class HOG : public IHOG
+{
+public:
+    /** Default constructor */
+    HOG();
+    /** Allocate the HOG descriptor using the given HOG's metadata
+     *
+     * @param[in] input HOG's metadata used to allocate the HOG descriptor
+     */
+    void init(const HOGInfo &input);
+
+    // Inherited methods overridden:
+    const HOGInfo *info() const override;
+    float *descriptor() const override;
+
+private:
+    HOGInfo                  _info;
+    std::unique_ptr<float[]> _descriptor;
+};
+}
+#endif /* __ARM_COMPUTE_HOG_H__ */
diff --git a/arm_compute/runtime/IFunction.h b/arm_compute/runtime/IFunction.h
new file mode 100644
index 0000000000..a4e7ed15e0
--- /dev/null
+++ b/arm_compute/runtime/IFunction.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_IFUNCTION_H__ +#define __ARM_COMPUTE_IFUNCTION_H__ + +namespace arm_compute +{ +/** Base class for all functions */ +class IFunction +{ +public: + /** Run the kernels contained in the function + * + * For NEON kernels: + * - Multi-threading is used for the kernels which are parallelisable. + * - By default std::thread::hardware_concurrency() threads are used. + * + * @note @ref CPPScheduler::set_num_threads() can be used to manually set the number of threads + * + * For OpenCL kernels: + * - All the kernels are enqueued on the queue associated with CLScheduler. + * - The queue is then flushed. + * + * @note The function will not block until the kernels are executed. It is the user's responsibility to wait. + */ + virtual void run() = 0; + /** Destructor + * + */ + virtual ~IFunction() = default; +}; +} +#endif /*__ARM_COMPUTE_IFUNCTION_H__ */ diff --git a/arm_compute/runtime/ILutAllocator.h b/arm_compute/runtime/ILutAllocator.h new file mode 100644 index 0000000000..f23fbd2154 --- /dev/null +++ b/arm_compute/runtime/ILutAllocator.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef __ARM_COMPUTE_ILUTALLOCATOR_H__
+#define __ARM_COMPUTE_ILUTALLOCATOR_H__
+
+#include "arm_compute/core/Types.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+/** Basic interface to allocate LUTs */
+class ILutAllocator
+{
+public:
+    /** Default constructor */
+    ILutAllocator();
+    /** Default virtual destructor */
+    virtual ~ILutAllocator() = default;
+    /** Allow instances of this class to be move constructed */
+    ILutAllocator(ILutAllocator &&) = default;
+    /** Allow instances of this class to be moved */
+    ILutAllocator &operator=(ILutAllocator &&) = default;
+    /** Allocate a LUT of the requested number of elements and data_type.
+     *
+     * @param[in] num_elements Number of elements of the LUT.
+     * @param[in] data_type    Data type of each element.
+     */
+    void init(size_t num_elements, DataType data_type);
+    /** Returns the total number of elements in the LUT.
+     *
+     * @return Total number of elements.
+     */
+    size_t num_elements() const;
+    /** Returns the type of the LUT.
+     *
+     * @return The type of the LUT.
+     */
+    DataType type() const;
+    /** Returns the total size in bytes of the LUT.
+     *
+     * @return Total size of the LUT in bytes.
+     */
+    size_t size() const;
+
+protected:
+    /** Interface to be implemented by the child class to allocate the LUT. */
+    virtual void allocate() = 0;
+    /** Interface to be implemented by the child class to lock the memory allocation for the CPU to access.
+     *
+     * @return Pointer to a CPU mapping of the memory
+     */
+    virtual uint8_t *lock() = 0;
+    /** Interface to be implemented by the child class to unlock the memory allocation after the CPU is done accessing it. */
+    virtual void unlock() = 0;
+
+private:
+    size_t   _num_elements; /**< Number of elements allocated */
+    DataType _data_type;    /**< Data type of LUT elements. */
+};
+}
+#endif /* __ARM_COMPUTE_ILUTALLOCATOR_H__ */
diff --git a/arm_compute/runtime/IScheduler.h b/arm_compute/runtime/IScheduler.h
new file mode 100644
index 0000000000..39c027c6b7
--- /dev/null
+++ b/arm_compute/runtime/IScheduler.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ISCHEDULER_H__
+#define __ARM_COMPUTE_ISCHEDULER_H__
+
+namespace arm_compute
+{
+class ICPPKernel;
+
+/** Scheduler interface to run kernels */
+class IScheduler
+{
+public:
+    /** Destructor. */
+    virtual ~IScheduler() = default;
+    /** Sets the number of threads the scheduler will use to run the kernels.
+     *
+     * @param[in] num_threads If set to 0, then one thread per CPU core available on the system will be used, otherwise the number of threads specified.
+     */
+    virtual void set_num_threads(unsigned int num_threads) = 0;
+    /** Returns the number of threads that the scheduler has in its pool.
+     *
+     * @return Number of threads available in the scheduler.
+     */
+    virtual unsigned int num_threads() const = 0;
+    /** Runs the kernel in the same thread as the caller synchronously.
+     *
+     * @param[in] kernel          Kernel to execute.
+     * @param[in] split_dimension Dimension along which to split the kernel's execution window.
+     */
+    virtual void schedule(ICPPKernel *kernel, unsigned int split_dimension) = 0;
+};
+}
+#endif /* __ARM_COMPUTE_ISCHEDULER_H__ */
diff --git a/arm_compute/runtime/ITensorAllocator.h b/arm_compute/runtime/ITensorAllocator.h
new file mode 100644
index 0000000000..6103e436bc
--- /dev/null
+++ b/arm_compute/runtime/ITensorAllocator.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ITENSORALLOCATOR_H__
+#define __ARM_COMPUTE_ITENSORALLOCATOR_H__
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+/** Interface to allocate tensors */
+class ITensorAllocator
+{
+public:
+    /** Default constructor. */
+    ITensorAllocator();
+    /** Allow instances of this class to be copy constructed */
+    ITensorAllocator(const ITensorAllocator &) = default;
+    /** Allow instances of this class to be copied */
+    ITensorAllocator &operator=(const ITensorAllocator &) = default;
+    /** Allow instances of this class to be move constructed */
+    ITensorAllocator(ITensorAllocator &&) = default;
+    /** Allow instances of this class to be moved */
+    ITensorAllocator &operator=(ITensorAllocator &&) = default;
+    /** Default virtual destructor. */
+    virtual ~ITensorAllocator() = default;
+
+    /** Initialize a tensor based on the passed @ref TensorInfo.
+     *
+     * @param[in] input TensorInfo object containing the description of the tensor to initialize.
+     */
+    void init(const TensorInfo &input);
+    /** Return a reference to the tensor's metadata
+     *
+     * @return Reference to the tensor's metadata.
+     */
+    TensorInfo &info();
+    /** Return a constant reference to the tensor's metadata
+     *
+     * @return Constant reference to the tensor's metadata.
+     */
+    const TensorInfo &info() const;
+
+    /** Interface to be implemented by the child class to allocate the tensor.
+     *
+     * @note The child is expected to use the TensorInfo to get the size of the memory allocation.
+     * @warning The tensor must not already be allocated. Otherwise calling the function will fail.
+     */
+    virtual void allocate() = 0;
+
+    /** Interface to be implemented by the child class to free the allocated tensor.
+     *
+     * @warning The tensor must have been allocated previously. Otherwise calling the function will fail.
+     */
+    virtual void free() = 0;
+
+protected:
+    /** Interface to be implemented by the child class to lock the memory allocation for the CPU to access.
+     *
+     * @return Pointer to a CPU mapping of the memory
+     */
+    virtual uint8_t *lock() = 0;
+    /** Interface to be implemented by the child class to unlock the memory allocation after the CPU is done accessing it. */
+    virtual void unlock() = 0;
+
+private:
+    TensorInfo _info; /**< Tensor's metadata. */
+};
+}
+#endif /*__ARM_COMPUTE_ITENSORALLOCATOR_H__ */
diff --git a/arm_compute/runtime/Lut.h b/arm_compute/runtime/Lut.h
new file mode 100644
index 0000000000..87431feee4
--- /dev/null
+++ b/arm_compute/runtime/Lut.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_LUT_H__
+#define __ARM_COMPUTE_LUT_H__
+
+#include "arm_compute/core/ILut.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/LutAllocator.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+class ILutAllocator;
+
+/** Basic implementation of the LUT interface */
+class Lut : public ILut
+{
+public:
+    /** Constructor */
+    Lut();
+    /** Constructor: initializes a LUT which can contain num_elements elements of data_type type.
+     *
+     * @param[in] num_elements Number of elements of the LUT.
+     * @param[in] data_type    Data type of each element.
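+     *
+     * A minimal usage sketch (assuming, as the allocator's init() implies, that the
+     * table storage is usable right after construction):
+     * @code
+     * Lut lut(256, DataType::U8); // 256-entry 8-bit table
+     * uint8_t *table = lut.buffer();
+     * for(size_t i = 0; i < lut.num_elements(); ++i)
+     * {
+     *     table[i] = 255 - i; // e.g. an inversion table
+     * }
+     * @endcode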
+ */ + Lut(size_t num_elements, DataType data_type); + /** Return a pointer to the LUT's allocator + * + * @return A pointer to the LUT's allocator + */ + ILutAllocator *allocator(); + + // Inherited methods overridden: + size_t num_elements() const override; + uint32_t index_offset() const override; + size_t size_in_bytes() const override; + DataType type() const override; + uint8_t *buffer() const override; + void clear() override; + +private: + LutAllocator _allocator; /**< Instance of the basic CPU allocator.*/ +}; +} +#endif /* __ARM_COMPUTE_LUT_H__ */ diff --git a/arm_compute/runtime/LutAllocator.h b/arm_compute/runtime/LutAllocator.h new file mode 100644 index 0000000000..76b596bfa0 --- /dev/null +++ b/arm_compute/runtime/LutAllocator.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_LUTALLOCATOR_H__ +#define __ARM_COMPUTE_LUTALLOCATOR_H__ + +#include "arm_compute/runtime/ILutAllocator.h" + +#include <cstdint> +#include <memory> + +namespace arm_compute +{ +/** Basic implementation of a CPU memory LUT allocator. */ +class LutAllocator : public ILutAllocator +{ +public: + /** Default constructor. */ + LutAllocator(); + /** Return a pointer to the allocated data. */ + uint8_t *data() const; + +protected: + /** Allocate num_elements() * sizeof(type()) of CPU memory. */ + void allocate() override; + /** No-op for CPU memory + * + * @return A pointer to the beginning of the look-up table's allocation. + */ + uint8_t *lock() override; + /** No-op for CPU memory. */ + void unlock() override; + +private: + std::unique_ptr<uint8_t[]> _buffer; /**< CPU memory allocation. */ +}; +} +#endif /* __ARM_COMPUTE_LUTALLOCATOR_H__ */ diff --git a/arm_compute/runtime/MultiHOG.h b/arm_compute/runtime/MultiHOG.h new file mode 100644 index 0000000000..32bad70738 --- /dev/null +++ b/arm_compute/runtime/MultiHOG.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_MULTIHOG_H__ +#define __ARM_COMPUTE_MULTIHOG_H__ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IMultiHOG.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/HOG.h" + +#include <memory> + +namespace arm_compute +{ +/** CPU implementation of multi HOG data-object */ +class MultiHOG : public IMultiHOG +{ +public: + /** Constructor + * + * @param[in] num_models Number of HOG data objects to contain + */ + MultiHOG(size_t num_models); + + // Inherited methods overridden: + size_t num_models() const override; + IHOG *model(size_t index) override; + const IHOG *model(size_t index) const override; + +private: + size_t _num_models; /**< Number of HOG models */ + std::unique_ptr<HOG[]> _model; /**< Array of HOG models */ +}; +} + +#endif /* __ARM_COMPUTE_MULTIHOG_H__ */ diff --git a/arm_compute/runtime/MultiImage.h b/arm_compute/runtime/MultiImage.h new file mode 100644 index 0000000000..917e586ef8 --- /dev/null +++ b/arm_compute/runtime/MultiImage.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ +#ifndef __ARM_COMPUTE_MULTIIMAGE_H__ +#define __ARM_COMPUTE_MULTIIMAGE_H__ + +#include "arm_compute/core/IMultiImage.h" +#include "arm_compute/core/MultiImageInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/Tensor.h" + +#include <array> + +namespace arm_compute +{ +class Coordinates; +class ITensor; +using IImage = ITensor; + +/** Basic implementation of the multi-planar image interface */ +class MultiImage : public IMultiImage +{ +public: + /** Constructor */ + MultiImage(); + /** Initialise the multi-planar image + * + * @param[in] width Width of the whole image + * @param[in] height Height of the whole image + * @param[in] format Format of the whole image + */ + void init(unsigned int width, unsigned int height, Format format); + /** Initialise the multi-planar image + * + * @note Uses a conservative padding strategy which fits all kernels. + * + * @param[in] width Width of the whole image + * @param[in] height Height of the whole image + * @param[in] format Format of the whole image + */ + void init_auto_padding(unsigned int width, unsigned int height, Format format); + /** Allocate a previously initialised multi-planar image + * + * @note The multi-planar image must not already be allocated when calling this function. + */ + void allocate(); + /** Create a subimage from an existing MultiImage. + * + * @param[in] image Multi-planar image providing the backing memory + * @param[in] coords Starting coordinates of the new image. Must lie within the parent image's dimensions + * @param[in] width The width of the subimage + * @param[in] height The height of the subimage + */ + void create_subimage(MultiImage *image, const Coordinates &coords, unsigned int width, unsigned int height); + + // Inherited methods overridden: + const MultiImageInfo *info() const override; + Image *plane(unsigned int index) override; + const Image *plane(unsigned int index) const override; + +private: + /** Init the multi-planar image + * + * @param[in] width Width of the whole image + * @param[in] height Height of the whole image + * @param[in] format Format of the whole image + * @param[in] auto_padding Specifies whether the image uses auto padding + */ + void internal_init(unsigned int width, unsigned int height, Format format, bool auto_padding); + + MultiImageInfo _info; /**< Instance of the multi-planar image's metadata */ + std::array<Image, 3> _plane; /**< Images backing each plane of the multi-planar image */ +}; +} +#endif /*__ARM_COMPUTE_MULTIIMAGE_H__ */ diff --git a/arm_compute/runtime/NEON/INESimpleFunction.h b/arm_compute/runtime/NEON/INESimpleFunction.h new file mode 100644 index 0000000000..6e000d8fd8 --- /dev/null +++ b/arm_compute/runtime/NEON/INESimpleFunction.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_INESIMPLEFUNCTION_H__ +#define __ARM_COMPUTE_INESIMPLEFUNCTION_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/runtime/IFunction.h" + +#include <memory> + +namespace arm_compute +{ +/** Basic interface for functions which have a single NEON kernel */ +class INESimpleFunction : public IFunction +{ +public: + /** Constructor */ + INESimpleFunction(); + + // Inherited methods overridden: + void run() override final; + +protected: + std::unique_ptr<INEKernel> _kernel; /**< Kernel to run */ + NEFillBorderKernel _border_handler; /**< Kernel to handle image borders */ +}; +} +#endif /*__ARM_COMPUTE_INESIMPLEFUNCTION_H__ */ diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h new file mode 100644 index 0000000000..daf76f3a87 --- /dev/null +++ b/arm_compute/runtime/NEON/NEFunctions.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ +#ifndef __ARM_COMPUTE_NEFUNCTIONS_H__ +#define __ARM_COMPUTE_NEFUNCTIONS_H__ + +/* Header regrouping all the NEON functions */ +#include "arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h" +#include "arm_compute/runtime/NEON/functions/NEAccumulate.h" +#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" +#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" +#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h" +#include "arm_compute/runtime/NEON/functions/NEBitwiseNot.h" +#include "arm_compute/runtime/NEON/functions/NEBitwiseOr.h" +#include "arm_compute/runtime/NEON/functions/NEBitwiseXor.h" +#include "arm_compute/runtime/NEON/functions/NEBox3x3.h" +#include "arm_compute/runtime/NEON/functions/NECannyEdge.h" +#include "arm_compute/runtime/NEON/functions/NEChannelCombine.h" +#include "arm_compute/runtime/NEON/functions/NEChannelExtract.h" +#include "arm_compute/runtime/NEON/functions/NEColorConvert.h" +#include "arm_compute/runtime/NEON/functions/NEConvolution.h" +#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h" +#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h" +#include "arm_compute/runtime/NEON/functions/NEDerivative.h" +#include "arm_compute/runtime/NEON/functions/NEDilate.h" +#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h" +#include "arm_compute/runtime/NEON/functions/NEErode.h" +#include "arm_compute/runtime/NEON/functions/NEFastCorners.h" +#include "arm_compute/runtime/NEON/functions/NEFillBorder.h" +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" +#include "arm_compute/runtime/NEON/functions/NEGEMM.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowp.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h" +#include "arm_compute/runtime/NEON/functions/NEGaussian3x3.h" +#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h" +#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h" +#include "arm_compute/runtime/NEON/functions/NEHOGDescriptor.h" +#include "arm_compute/runtime/NEON/functions/NEHOGDetector.h" +#include "arm_compute/runtime/NEON/functions/NEHOGGradient.h" +#include "arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h" +#include "arm_compute/runtime/NEON/functions/NEHarrisCorners.h" +#include "arm_compute/runtime/NEON/functions/NEHistogram.h" +#include "arm_compute/runtime/NEON/functions/NEIntegralImage.h" +#include "arm_compute/runtime/NEON/functions/NELaplacianPyramid.h" +#include "arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h" +#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h" +#include "arm_compute/runtime/NEON/functions/NEMagnitude.h" +#include "arm_compute/runtime/NEON/functions/NEMeanStdDev.h" +#include "arm_compute/runtime/NEON/functions/NEMedian3x3.h" +#include "arm_compute/runtime/NEON/functions/NEMinMaxLocation.h" +#include "arm_compute/runtime/NEON/functions/NENonLinearFilter.h" +#include "arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h" +#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEOpticalFlow.h" +#include "arm_compute/runtime/NEON/functions/NEPhase.h" +#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" +#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h" +#include "arm_compute/runtime/NEON/functions/NERemap.h" +#include "arm_compute/runtime/NEON/functions/NEScale.h" +#include "arm_compute/runtime/NEON/functions/NEScharr3x3.h" +#include "arm_compute/runtime/NEON/functions/NESobel3x3.h" +#include "arm_compute/runtime/NEON/functions/NESobel5x5.h" +#include "arm_compute/runtime/NEON/functions/NESobel7x7.h" +#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h" +#include "arm_compute/runtime/NEON/functions/NETableLookup.h" +#include "arm_compute/runtime/NEON/functions/NEThreshold.h" +#include "arm_compute/runtime/NEON/functions/NETranspose.h" +#include "arm_compute/runtime/NEON/functions/NEWarpAffine.h" +#include "arm_compute/runtime/NEON/functions/NEWarpPerspective.h" + +#endif /* __ARM_COMPUTE_NEFUNCTIONS_H__ */ diff --git a/arm_compute/runtime/NEON/NEScheduler.h b/arm_compute/runtime/NEON/NEScheduler.h new file mode 100644 index 0000000000..94c82b2f03 --- /dev/null +++ b/arm_compute/runtime/NEON/NEScheduler.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NESCHEDULER_H__ +#define __ARM_COMPUTE_NESCHEDULER_H__ + +#include "arm_compute/runtime/Scheduler.h" + +namespace arm_compute +{ +using NEScheduler = Scheduler; +} +#endif /*__ARM_COMPUTE_NESCHEDULER_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h b/arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h new file mode 100644 index 0000000000..266a27586a --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEABSOLUTEDIFFERENCE_H__ +#define __ARM_COMPUTE_NEABSOLUTEDIFFERENCE_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEAbsoluteDifferenceKernel + * + * @note The image data type for the inputs must be U8 or S16 + * @note The function also calculates the absolute difference when the two inputs have different data types + */ +class NEAbsoluteDifference : public INESimpleFunction +{ +public: + /** Set the inputs and output images + * + * @param[in] input1 Source tensor. Data types supported: U8/S16. + * @param[in] input2 Source tensor. Data types supported: U8/S16. + * @param[out] output Destination tensor. Data types supported: U8/S16. + */ + void configure(const ITensor *input1, const ITensor *input2, ITensor *output); +}; +} +#endif /* __ARM_COMPUTE_NEABSOLUTEDIFFERENCE_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEAccumulate.h b/arm_compute/runtime/NEON/functions/NEAccumulate.h new file mode 100644 index 0000000000..de532c37a0 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEAccumulate.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEACCUMULATE_H__ +#define __ARM_COMPUTE_NEACCUMULATE_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +#include <cstdint> + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEAccumulateKernel */ +class NEAccumulate : public INESimpleFunction +{ +public: + /** Set the input and accumulation tensors + * + * @param[in] input Source tensor. Data type supported: U8. + * @param[out] output Destination tensor. Data type supported: S16.
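+ * + * Illustrative sketch (assumes pre-allocated tensors src (U8) and accum (S16) of matching shape): + * @code + * NEAccumulate acc; + * acc.configure(&src, &accum); + * acc.run(); // accum += src + * @endcode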
+ */ + void configure(const ITensor *input, ITensor *output); +}; + +/** Basic function to run @ref NEAccumulateWeightedKernel */ +class NEAccumulateWeighted : public INESimpleFunction +{ +public: + /** Set the input and accumulation tensors, and the scale value + * + * @param[in] input Source tensor. Data type supported: U8. + * @param[in] alpha The input scalar value with a value in the range of [0, 1.0] + * @param[in,out] output Accumulated tensor. Data type supported: U8. + * @param[in] use_fp16 (Optional) If true the FP16 kernels will be used. If false F32 kernels are used. + */ + void configure(const ITensor *input, float alpha, ITensor *output, bool use_fp16 = false); +}; + +/** Basic function to run @ref NEAccumulateSquaredKernel */ +class NEAccumulateSquared : public INESimpleFunction +{ +public: + /** Set the input and accumulation tensors and the shift value. + * + * @param[in] input Source tensor. Data type supported: U8. + * @param[in] shift The input shift value, in the range of [0, 15] + * @param[in,out] output Accumulated tensor. Data type supported: S16. + */ + void configure(const ITensor *input, uint32_t shift, ITensor *output); +}; +} +#endif /*__ARM_COMPUTE_NEACCUMULATE_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEActivationLayer.h b/arm_compute/runtime/NEON/functions/NEActivationLayer.h new file mode 100644 index 0000000000..35366e16fb --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEActivationLayer.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEACTIVATIONLAYER_H__ +#define __ARM_COMPUTE_NEACTIVATIONLAYER_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEActivationLayerKernel + * + * @note The function simulates an activation layer with the specified activation function. + */ +class NEActivationLayer : public INESimpleFunction +{ +public: + /** Set the input and output tensor. + * + * @param[in] input Source tensor. Data type supported: QS8/F32. + * @param[out] output Destination tensor. Data type supported: same as @p input + * @param[in] activation_info Activation layer parameters.
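+ * + * Illustrative sketch (assumes pre-allocated F32 tensors src and dst of matching shape): + * @code + * NEActivationLayer act; + * act.configure(&src, &dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); + * act.run(); + * @endcode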
+ */ + void configure(const ITensor *input, ITensor *output, ActivationLayerInfo activation_info); +}; +} +#endif /* __ARM_COMPUTE_NEACTIVATIONLAYER_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h new file mode 100644 index 0000000000..8e34e983c7 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEARITHMETICADDITION_H__ +#define __ARM_COMPUTE_NEARITHMETICADDITION_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEArithmeticAdditionKernel */ +class NEArithmeticAddition : public INESimpleFunction +{ +public: + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in] input1 First tensor input. Data types supported: U8/S16. + * @param[in] input2 Second tensor input. Data types supported: U8/S16. + * @param[out] output Output tensor. Data types supported: U8/S16. + * @param[in] policy Policy to use to handle overflow. + */ + void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy); +}; +} +#endif /*__ARM_COMPUTE_NEARITHMETICADDITION_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h new file mode 100644 index 0000000000..841b5912b9 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEARITHMETICSUBTRACTION_H__ +#define __ARM_COMPUTE_NEARITHMETICSUBTRACTION_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEArithmeticSubtractionKernel */ +class NEArithmeticSubtraction : public INESimpleFunction +{ +public: + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in] input1 First tensor input. Data types supported: U8/S16. + * @param[in] input2 Second tensor input. Data types supported: U8/S16. + * @param[out] output Output tensor. Data types supported: U8/S16. + * @param[in] policy Policy to use to handle overflow. + */ + void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy); +}; +} +#endif /* __ARM_COMPUTE_NEARITHMETICSUBTRACTION_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h new file mode 100644 index 0000000000..b0b5c122cb --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEBATCHNORMALIZATIONLAYER_H__ +#define __ARM_COMPUTE_NEBATCHNORMALIZATIONLAYER_H__ + +#include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEBatchNormalizationLayerKernel and simulate a batch normalization layer.
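+ * + * Illustrative configuration sketch (tensors assumed pre-allocated; the epsilon value is an arbitrary example): + * @code + * NEBatchNormalizationLayer bn; + * bn.configure(&src, &dst, &mean, &var, &beta, &gamma, 0.001f); + * bn.run(); + * @endcode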
+ * + * Batch normalization is calculated by: + * @f[ out_i = \gamma * (\frac{in_i - \mu_{B}}{\sqrt{\sigma^2_{B} + \epsilon}}) + \beta \equiv BN_{\gamma,\beta}(in_i) @f] + * + */ +class NEBatchNormalizationLayer : public IFunction +{ +public: + /** Default constructor */ + NEBatchNormalizationLayer(); + /** Set the input and output tensors. + * + * @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, FM]. + * The rest are optional and used for representing batches. Data types supported: QS8/F32. + * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input + * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] beta Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] epsilon Small value to avoid division by zero. + */ + void configure(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon); + + // Inherited methods overridden: + void run() override; + +private: + NEBatchNormalizationLayerKernel _norm_kernel; /**< Batch normalization layer kernel */ +}; +} +#endif /* __ARM_COMPUTE_NEBATCHNORMALIZATIONLAYER_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h b/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h new file mode 100644 index 0000000000..0250293e97 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEBITWISEAND_H__ +#define __ARM_COMPUTE_NEBITWISEAND_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEBitwiseAndKernel */ +class NEBitwiseAnd : public INESimpleFunction +{ +public: + /** Initialise the kernel's inputs and output + * + * @param[in] input1 First tensor input. Data type supported: U8.
+ * @param[in] input2 Second tensor input. Data type supported: U8. + * @param[out] output Output tensor. Data type supported: U8. + */ + void configure(const ITensor *input1, const ITensor *input2, ITensor *output); +}; +} +#endif /* __ARM_COMPUTE_NEBITWISEAND_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseNot.h b/arm_compute/runtime/NEON/functions/NEBitwiseNot.h new file mode 100644 index 0000000000..62c08ffcf9 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEBitwiseNot.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEBITWISENOT_H__ +#define __ARM_COMPUTE_NEBITWISENOT_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEBitwiseNotKernel */ +class NEBitwiseNot : public INESimpleFunction +{ +public: + /** Initialise the kernel's input and output + * + * @param[in] input Input tensor. Data type supported: U8. + * @param[out] output Output tensor. Data type supported: U8. + */ + void configure(const ITensor *input, ITensor *output); +}; +} +#endif /* __ARM_COMPUTE_NEBITWISENOT_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseOr.h b/arm_compute/runtime/NEON/functions/NEBitwiseOr.h new file mode 100644 index 0000000000..1c9a2f9d2e --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEBitwiseOr.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEBITWISEOR_H__ +#define __ARM_COMPUTE_NEBITWISEOR_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEBitwiseOrKernel */ +class NEBitwiseOr : public INESimpleFunction +{ +public: + /** Initialise the kernel's inputs and output + * + * @param[in] input1 First tensor input. Data type supported: U8. + * @param[in] input2 Second tensor input. Data type supported: U8. + * @param[out] output Output tensor. Data type supported: U8. + */ + void configure(const ITensor *input1, const ITensor *input2, ITensor *output); +}; +} +#endif /* __ARM_COMPUTE_NEBITWISEOR_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseXor.h b/arm_compute/runtime/NEON/functions/NEBitwiseXor.h new file mode 100644 index 0000000000..4690f0a4e3 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEBitwiseXor.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEBITWISEXOR_H__ +#define __ARM_COMPUTE_NEBITWISEXOR_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEBitwiseXorKernel */ +class NEBitwiseXor : public INESimpleFunction +{ +public: + /** Initialise the kernel's inputs and output + * + * @param[in] input1 First tensor input. Data type supported: U8. + * @param[in] input2 Second tensor input. Data type supported: U8. + * @param[out] output Output tensor. Data type supported: U8. + */ + void configure(const ITensor *input1, const ITensor *input2, ITensor *output); +}; +} +#endif /* __ARM_COMPUTE_NEBITWISEXOR_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEBox3x3.h b/arm_compute/runtime/NEON/functions/NEBox3x3.h new file mode 100644 index 0000000000..2b5440a74c --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEBox3x3.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEBOX3x3_H__ +#define __ARM_COMPUTE_NEBOX3x3_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +#include <cstdint> + +namespace arm_compute +{ +class ITensor; + +/** Basic function to execute a 3x3 box filter. This function calls the following NEON kernels: + * + * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE) + * -# @ref NEBox3x3Kernel + * + */ +class NEBox3x3 : public INESimpleFunction +{ +public: + /** Initialise the function's input, output and border mode. + * + * @note The border handler is run on the input tensor. + * + * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output Destination tensor. Data type supported: U8. + * @param[in] border_mode Strategy to use for borders. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + * @param[in] use_fp16 (Optional) If true the FP16 kernels will be used. If false F32 kernels are used. + */ + void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value = 0, bool use_fp16 = false); +}; +} +#endif /*__ARM_COMPUTE_NEBOX3x3_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NECannyEdge.h b/arm_compute/runtime/NEON/functions/NECannyEdge.h new file mode 100644 index 0000000000..fbf2d90740 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NECannyEdge.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NECANNYEDGE_H__ +#define __ARM_COMPUTE_NECANNYEDGE_H__ + +#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h" +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/Tensor.h" + +#include <cstdint> +#include <memory> + +namespace arm_compute +{ +class ITensor; + +/** Basic function to execute the Canny edge algorithm on NEON. This function calls the following NEON kernels and functions: + * + * -# @ref NEFillBorderKernel (if border_mode == REPLICATE or border_mode == CONSTANT) + * -# @ref NESobel3x3 (if gradient_size == 3) or + * @ref NESobel5x5 (if gradient_size == 5) or + * @ref NESobel7x7 (if gradient_size == 7) + * -# @ref NEGradientKernel + * -# @ref NEEdgeNonMaxSuppressionKernel + * -# @ref NEEdgeTraceKernel + * + */ +class NECannyEdge : public IFunction +{ +public: + /** Constructor + * + * Initialize Sobel kernel to nullptr. + */ + NECannyEdge(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NECannyEdge(const NECannyEdge &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NECannyEdge &operator=(const NECannyEdge &) = delete; + /** Initialise the function's source, destination, thresholds, gradient size, normalization type and border mode. + * + * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED) + * @param[out] output Destination tensor. Data type supported: U8. + * @param[in] upper_thr Upper threshold used for the hysteresis + * @param[in] lower_thr Lower threshold used for the hysteresis. + * @param[in] gradient_size Gradient size (3, 5 or 7) + * @param[in] norm_type Normalization type. If 1, L1-Norm; otherwise L2-Norm + * @param[in] border_mode Border mode to use for the convolution. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + * @param[in] use_fp16 (Optional) If true the FP16 kernels will be used. If false F32 kernels are used.
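+ * + * Illustrative sketch (assumes pre-allocated U8 tensors src and dst; the threshold values are arbitrary examples): + * @code + * NECannyEdge canny; + * canny.configure(&src, &dst, 50, 20, 3, 1, BorderMode::REPLICATE); // thresholds 50/20, 3x3 Sobel, L1 norm + * canny.run(); + * @endcode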
+ * + */ + void configure(ITensor *input, ITensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value = 0, + bool use_fp16 = false); + + // Inherited methods overridden: + void run() override; + +private: + std::unique_ptr<IFunction> _sobel; /**< Pointer to Sobel function */ + std::unique_ptr<INEKernel> _gradient; /**< Gradient kernel */ + NEEdgeNonMaxSuppressionKernel _non_max_suppr; /**< Non-Maxima suppression kernel */ + NEEdgeTraceKernel _edge_trace; /**< Edge tracing kernel */ + NEFillBorderKernel _border_mag_gradient; /**< Fill border on magnitude tensor kernel */ + NEFillBorderKernel _border_edge_trace; /**< Fill border before edge trace */ + Tensor _gx; /**< Intermediate tensor - Gx component */ + Tensor _gy; /**< Intermediate tensor - Gy component */ + Tensor _magnitude; /**< Intermediate tensor - Magnitude */ + Tensor _phase; /**< Intermediate tensor - Phase */ + Tensor _nonmax; /**< Intermediate tensor - Non-Maxima suppressed */ + ITensor *_output; /**< Output tensor provided by the user. */ +}; +} +#endif /* __ARM_COMPUTE_NECANNYEDGE_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEChannelCombine.h b/arm_compute/runtime/NEON/functions/NEChannelCombine.h new file mode 100644 index 0000000000..7133553e1d --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEChannelCombine.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NECHANNELCOMBINE_H__ +#define __ARM_COMPUTE_NECHANNELCOMBINE_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +namespace arm_compute +{ +class IMultiImage; +class ITensor; +using IImage = ITensor; + +/** Basic function to run @ref NEChannelCombineKernel to perform channel combination. */ +class NEChannelCombine : public INESimpleFunction +{ +public: + /** Initialize function's inputs and outputs. + * + * @param[in] plane0 The 2D plane that forms channel 0. Data type supported: U8 + * @param[in] plane1 The 2D plane that forms channel 1. Data type supported: U8 + * @param[in] plane2 The 2D plane that forms channel 2. Data type supported: U8 + * @param[in] plane3 The 2D plane that forms channel 3. Data type supported: U8 + * @param[out] output The single-planar output tensor.
Formats supported: RGB888/RGBA8888/UYVY422/YUYV422 + */ + void configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output); + /** Initialize function's inputs and outputs. + * + * @param[in] plane0 The 2D plane that forms channel 0. Data type supported: U8 + * @param[in] plane1 The 2D plane that forms channel 1. Data type supported: U8 + * @param[in] plane2 The 2D plane that forms channel 2. Data type supported: U8 + * @param[out] output The multi-planar output image. Formats supported: NV12/NV21/IYUV/YUV444 + */ + void configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output); +}; +} +#endif /*__ARM_COMPUTE_NECHANNELCOMBINE_H__*/ diff --git a/arm_compute/runtime/NEON/functions/NEChannelExtract.h b/arm_compute/runtime/NEON/functions/NEChannelExtract.h new file mode 100644 index 0000000000..5e46eef3a6 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEChannelExtract.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NECHANNELEXTRACT_H__ +#define __ARM_COMPUTE_NECHANNELEXTRACT_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +namespace arm_compute +{ +class IMultiImage; +class ITensor; +using IImage = ITensor; + +/** Basic function to run @ref NEChannelExtractKernel to perform channel extraction. */ +class NEChannelExtract : public INESimpleFunction +{ +public: + /** Initialize the function's source and destination + * + * @param[in] input The input tensor to extract the channel from. Formats supported: Any single planar. + * @param[in] channel The channel to extract. + * @param[out] output The extracted channel. Format supported: U8 + */ + void configure(const ITensor *input, Channel channel, ITensor *output); + /** Initialize the function's source and destination + * + * @param[in] input The multi-planar input image to extract channel from. + * @param[in] channel The channel to extract. + * @param[out] output The extracted channel.
Format supported: U8 + */ + void configure(const IMultiImage *input, Channel channel, IImage *output); +}; +} +#endif /*__ARM_COMPUTE_NECHANNELEXTRACT_H__*/ diff --git a/arm_compute/runtime/NEON/functions/NEColorConvert.h b/arm_compute/runtime/NEON/functions/NEColorConvert.h new file mode 100644 index 0000000000..2997778ed5 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEColorConvert.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NECOLORCONVERT_H__ +#define __ARM_COMPUTE_NECOLORCONVERT_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +namespace arm_compute +{ +class ITensor; +class IMultiImage; +using IImage = ITensor; + +/** Basic function to run @ref NEColorConvertKernel to perform color conversion. */ +class NEColorConvert : public INESimpleFunction +{ +public: + /** Initialize the function's source and destination + * + * @param[in] input The input single-planar tensor from which to convert + * @param[out] output The converted single-planar output tensor + */ + void configure(const ITensor *input, ITensor *output); + /** Initialize the function's source and destination + * + * @param[in] input The multi-planar input image from which to convert + * @param[out] output The converted single-planar output image + */ + void configure(const IMultiImage *input, IImage *output); + /** Initialize the function's source and destination + * + * @param[in] input The single-planar input image from which to convert + * @param[out] output The converted multi-planar output image + */ + void configure(const IImage *input, IMultiImage *output); + /** Initialize the function's source and destination + * + * @param[in] input The multi-planar input image from which to convert + * @param[out] output The converted multi-planar output image + */ + void configure(const IMultiImage *input, IMultiImage *output); +}; +} +#endif /*__ARM_COMPUTE_NECOLORCONVERT_H__*/ diff --git a/arm_compute/runtime/NEON/functions/NEConvolution.h b/arm_compute/runtime/NEON/functions/NEConvolution.h new file mode 100644 index 0000000000..1704d9fa94 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEConvolution.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited.
+};
+}
+#endif /*__ARM_COMPUTE_NECOLORCONVERT_H__*/
diff --git a/arm_compute/runtime/NEON/functions/NEConvolution.h b/arm_compute/runtime/NEON/functions/NEConvolution.h
new file mode 100644
index 0000000000..1704d9fa94
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEConvolution.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECONVOLUTION_H__
+#define __ARM_COMPUTE_NECONVOLUTION_H__
+
+#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute convolution of size 3x3. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NEConvolution3x3Kernel
+ *
+ */
+class NEConvolution3x3 : public INESimpleFunction
+{
+public:
+    /** Initialize the function's source, destination, conv and border_mode.
+     *
+     * @param[in,out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]    output                Destination tensor. Data types supported: U8/S16.
+     * @param[in]     conv                  matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
+     * @param[in]     scale                 Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
+     * @param[in]     border_mode           Strategy to use for borders.
+     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+
+/** Basic function to execute convolution of size 5x5, 7x7 or 9x9. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NEConvolutionKernel or
+ *    @ref NESeparableConvolutionHorKernel and @ref NESeparableConvolutionVertKernel (if the convolution matrix is separable)
+ *
+ */
+template <unsigned int matrix_size>
+class NEConvolutionSquare : public IFunction
+{
+public:
+    /** Default constructor */
+    NEConvolutionSquare();
+    /** Initialize the function's source, destination, conv and border_mode.
+     *
+     * @param[in,out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]    output                Destination tensor. Data types supported: U8 or S16.
+     * @param[in]     conv                  matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
+     * @param[in]     scale                 Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
+     * @param[in]     border_mode           Strategy to use for borders.
+     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    Tensor                                        _tmp;            /**< temporary buffer for output of horizontal pass */
+    bool                                          _is_separable;   /**< true if the convolution can be separated */
+    NESeparableConvolutionHorKernel<matrix_size>  _kernel_hor;     /**< kernel for horizontal pass of separated convolution */
+    NESeparableConvolutionVertKernel<matrix_size> _kernel_vert;    /**< kernel for vertical pass of separated convolution */
+    NEConvolutionKernel<matrix_size>              _kernel;         /**< kernel for non-separated convolution */
+    NEFillBorderKernel                            _border_handler; /**< kernel for border handling */
+};
+
+/** Basic function to run 5x5 convolution. */
+using NEConvolution5x5 = NEConvolutionSquare<5>;
+/** Basic function to run 7x7 convolution. */
+using NEConvolution7x7 = NEConvolutionSquare<7>;
+/** Basic function to run 9x9 convolution. */
+using NEConvolution9x9 = NEConvolutionSquare<9>;
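+
+/* A usage sketch for NEConvolution5x5 with placeholder tensors (allocation
+ * elided). Passing scale = 0 lets the function use the coefficient sum; the
+ * Gaussian weights below sum to 256, so the output is divided by 256:
+ *
+ *   const int16_t gauss5x5[25] = {
+ *       1,  4,  6,  4, 1,
+ *       4, 16, 24, 16, 4,
+ *       6, 24, 36, 24, 6,
+ *       4, 16, 24, 16, 4,
+ *       1,  4,  6,  4, 1
+ *   };
+ *   NEConvolution5x5 conv5x5;
+ *   conv5x5.configure(&src, &dst, gauss5x5, 0, BorderMode::UNDEFINED);
+ *   conv5x5.run();
+ */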
+
+/** Basic function to execute non-square convolution. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NEConvolutionRectangleKernel
+ *
+ * @note The convolution rectangle should have dimensions of 3, 5, 7 or 9
+ */
+class NEConvolutionRectangle : public INESimpleFunction
+{
+public:
+    /** Initialize the function's source, destination, conv and border_mode.
+     *
+     * @param[in,out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]    output                Destination tensor. Data types supported: U8 or S16.
+     * @param[in]     conv                  rows x cols S16 coefficients structured as a row-major 2D array in a linear buffer.
+     * @param[in]     rows                  Rows of the convolution kernel.
+     * @param[in]     cols                  Columns of the convolution kernel.
+     * @param[in]     scale                 Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
+     * @param[in]     border_mode           Strategy to use for borders.
+     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_NECONVOLUTION_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
new file mode 100644
index 0000000000..a8fff8d047
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECONVOLUTIONLAYER_H__
+#define __ARM_COMPUTE_NECONVOLUTIONLAYER_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
+#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Function to reshape and perform 1xW transposition on the weights. This function calls the following kernels:
+ * -# @ref NEWeightsReshapeKernel
+ * -# @ref NEGEMMTranspose1xWKernel (executed in case GEMM is required for the operation)
+ */
+class NEConvolutionLayerReshapeWeights : public IFunction
+{
+public:
+    /** Constructor */
+    NEConvolutionLayerReshapeWeights();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QS8/F32.
+     * @param[in]  biases       Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
+     * @param[out] output       Destination tensor. Data types supported: Same as @p weights.
+     * @param[in]  transpose1xW True if the weights are to undergo a 1xW transposition after reshaping (in case of GEMM operation), false otherwise.
+     */
+    void configure(const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose1xW);
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    NEWeightsReshapeKernel   _weights_reshape_kernel;
+    NEGEMMTranspose1xWKernel _weights_transposed_kernel;
+    Tensor                   _weights_reshaped;
+    bool                     _transpose1xW;
+};
+
+/** Basic function to compute a convolution layer. This function calls the following NEON kernels:
+ * -# @ref NEWeightsReshapeKernel (executed only once for each configuration)
+ * -# @ref NEIm2ColKernel
+ * -# @ref NEGEMMInterleave4x4Kernel (executed only in case GEMM is required for the operation)
+ * -# @ref NEGEMMMatrixMultiplyKernel
+ * -# @ref NECol2ImKernel
+ */
+class NEConvolutionLayer : public IFunction
+{
+public:
+    /** Constructor */
+    NEConvolutionLayer();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input        Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+     *                          while every optional dimension from 4 and above represent a batch of inputs.
+     *                          Data types supported: QS8/F32.
+     * @param[in]  weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+     * @param[in]  biases       Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[out] output       Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+     *                          Data types supported: Same as @p input.
+     * @param[in]  conv_info    Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  weights_info Specifies whether the weights tensor has been reshaped with NEWeightsReshapeKernel and, if it is not part of
+     *                          the fully connected layer, whether it has also been transposed with NEGEMMTranspose1xWKernel.
+     */
+    void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo());
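+
+    /* A minimal configuration sketch with placeholder tensors (shapes and
+     * allocation elided); PadStrideInfo(1, 1, 1, 1) describes stride 1 in x/y
+     * with one pixel of padding, as for a 3x3 kernel preserving width/height:
+     *
+     *   NEConvolutionLayer conv;
+     *   conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1));
+     *   conv.run();
+     */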
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    NEIm2ColKernel                   _input_im2col_kernel;
+    NEGEMMInterleave4x4Kernel        _input_interleave_kernel;
+    NEConvolutionLayerReshapeWeights _reshape_weights;
+    NEGEMMMatrixMultiplyKernel       _mm_kernel;
+    NECol2ImKernel                   _output_col2im_kernel;
+    Tensor                           _input_im2col_reshaped;
+    Tensor                           _input_interleaved_reshaped;
+    Tensor                           _weights_reshaped;
+    Tensor                           _gemm_output;
+    bool                             _has_bias;
+    bool                             _is_fully_connected_convolution;
+    bool                             _are_weights_reshaped;
+};
+}
+#endif /* __ARM_COMPUTE_NECONVOLUTIONLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEDepthConcatenate.h b/arm_compute/runtime/NEON/functions/NEDepthConcatenate.h
new file mode 100644
index 0000000000..02ff1227c7
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEDepthConcatenate.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDEPTHCONCATENATE_H__
+#define __ARM_COMPUTE_NEDEPTHCONCATENATE_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+#include <vector>
+
+namespace arm_compute
+{
+class ITensor;
+class NEDepthConcatenateKernel;
+class NEFillBorderKernel;
+
+/** Basic function to concatenate tensors along the z axis. This function calls the following kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if input's lowest two dimensions are smaller than respective output's dimensions)
+ * -# @ref NEDepthConcatenateKernel
+ *
+ */
+class NEDepthConcatenate : public IFunction
+{
+public:
+    /** Default constructor */
+    NEDepthConcatenate();
+    /** Initialise the kernel's inputs vector and output.
+     *
+     * @param[in,out] inputs_vector The vector containing all the tensors to concatenate. Data types supported: F32.
+     * @param[out]    output        Output tensor. Data types supported: F32.
+     */
+    void configure(std::vector<ITensor *> inputs_vector, ITensor *output);
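+
+    /* A minimal usage sketch; t0, t1 and dst are placeholder F32 tensors and
+     * dst's z dimension is assumed to cover both inputs:
+     *
+     *   std::vector<ITensor *> inputs = { &t0, &t1 };
+     *   NEDepthConcatenate concat;
+     *   concat.configure(inputs, &dst);
+     *   concat.run();
+     */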
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    std::vector<ITensor *>                      _inputs_vector;
+    std::unique_ptr<NEDepthConcatenateKernel[]> _concat_kernels_vector;
+    std::unique_ptr<NEFillBorderKernel[]>       _border_handlers_vector;
+    unsigned int                                _num_inputs;
+};
+}
+#endif /* __ARM_COMPUTE_NEDEPTHCONCATENATE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEDepthConvert.h b/arm_compute/runtime/NEON/functions/NEDepthConvert.h
new file mode 100644
index 0000000000..7c59ce432d
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEDepthConvert.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDEPTHCONVERT_H__
+#define __ARM_COMPUTE_NEDEPTHCONVERT_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEDepthConvertKernel */
+class NEDepthConvert : public INESimpleFunction
+{
+public:
+    /** Constructor */
+    NEDepthConvert() = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthConvert(const NEDepthConvert &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    const NEDepthConvert &operator=(const NEDepthConvert &) = delete;
+    /** Initialize the function's source and destination.
+     *
+     * The input format must be different from the output format.
+     *
+     * Valid conversions Input -> Output :
+     *   QS8 -> F32
+     *   U8  -> U16, S16, S32
+     *   U16 -> U8, U32
+     *   S16 -> U8, S32
+     *   F32 -> QS8
+     *
+     * @param[in]  input  The input tensor to convert. Data type supported: QS8/U8/U16/S16/F32.
+     * @param[out] output The output tensor. Data type supported: QS8/U8/U16/S16/U32/S32/F32.
+     * @param[in]  policy Conversion policy.
+     * @param[in]  shift  Value for down/up conversions. Must be 0 <= shift < 8.
+     *                    It is not used on fixed point conversion.
+     */
+    void configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift);
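+
+    /* A minimal usage sketch following the conversion table above; src_u8 and
+     * dst_s16 are placeholder tensors and the shift of 1 is illustrative only:
+     *
+     *   NEDepthConvert convert;
+     *   convert.configure(&src_u8, &dst_s16, ConvertPolicy::SATURATE, 1);
+     *   convert.run();
+     */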
+};
+}
+#endif /*__ARM_COMPUTE_NEDEPTHCONVERT_H__*/
diff --git a/arm_compute/runtime/NEON/functions/NEDerivative.h b/arm_compute/runtime/NEON/functions/NEDerivative.h
new file mode 100644
index 0000000000..57b7409b39
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEDerivative.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDERIVATIVE_H__
+#define __ARM_COMPUTE_NEDERIVATIVE_H__
+
+#include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute first order derivative operator. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NEDerivativeKernel
+ *
+ */
+class NEDerivative : public IFunction
+{
+public:
+    /** Default constructor */
+    NEDerivative();
+    /** Initialise the function's source, destinations and border mode.
+     *
+     * @note At least one of output_x or output_y must not be NULL.
+     *
+     * @param[in, out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     output_x              (Optional) Destination tensor. Derivative along the X direction. Data type supported: S16.
+     * @param[out]     output_y              (Optional) Destination tensor. Derivative along the Y direction. Data type supported: S16.
+     * @param[in]      border_mode           Border mode to use.
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    NEDerivativeKernel _kernel;         /**< Derivative kernel */
+    NEFillBorderKernel _border_handler; /**< Kernel to handle tensor borders */
+};
+}
+#endif /* __ARM_COMPUTE_NEDERIVATIVE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEDilate.h b/arm_compute/runtime/NEON/functions/NEDilate.h
new file mode 100644
index 0000000000..17bdb3363e
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEDilate.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDILATE_H__
+#define __ARM_COMPUTE_NEDILATE_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute dilate. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NEDilateKernel
+ *
+ */
+class NEDilate : public INESimpleFunction
+{
+public:
+    /** Initialise the kernel's inputs, output and border mode.
+     *
+     * @param[in, out] input                 First tensor input. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     output                Output tensor. Data type supported: U8.
+     * @param[in]      border_mode           Border mode to use.
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_NEDILATE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
new file mode 100644
index 0000000000..a356cac7c8
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H__
+#define __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H__
+
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+/** Function to run the direct convolution.
+ *
+ * This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel for the input
+ * -# @ref NEDirectConvolutionLayerBiasAccumulateKernel
+ * -# @ref NEDirectConvolutionLayerKernel
+ */
+class NEDirectConvolutionLayer : public IFunction
+{
+public:
+    /** Constructor */
+    NEDirectConvolutionLayer();
+    /** Set the input, weights, biases and output tensors.
+     *
+     * @param[in, out] input     Input tensor. Data types supported: QS8/F32.
+     * @param[in]      weights   Set of kernels to convolve the input volume.
+     *                           The 3rd dimension must be the same as the input's volume 3rd dimension.
+     *                           Data type supported: Same as @p input.
+     * @param[in]      bias      Set of biases. Data type supported: Same as @p input.
+     * @param[out]     output    Output tensor.
+     *                           The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p input.
+     * @param[in]      conv_info Contains padding and stride information described in @ref PadStrideInfo.
+     */
+    void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    NEDirectConvolutionLayerBiasAccumulateKernel _accumulate_bias_kernel;
+    NEDirectConvolutionLayerKernel               _conv_kernel;
+    NEFillBorderKernel                           _input_border_handler;
+    Tensor                                       _accumulator;
+};
+}
+#endif /* __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h b/arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h
new file mode 100644
index 0000000000..6cf8008480
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEEQUALIZEHISTOGRAM_H__
+#define __ARM_COMPUTE_NEEQUALIZEHISTOGRAM_H__
+
+#include "arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h"
+#include "arm_compute/core/NEON/kernels/NEHistogramKernel.h"
+#include "arm_compute/core/NEON/kernels/NETableLookupKernel.h"
+#include "arm_compute/runtime/Distribution1D.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/Lut.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** Basic function to execute histogram equalization. This function calls the following NEON kernels:
+ *
+ * -# @ref NEHistogramKernel
+ * -# @ref NECumulativeDistributionKernel
+ * -# @ref NETableLookupKernel
+ *
+ */
+class NEEqualizeHistogram : public IFunction
+{
+public:
+    /** Default Constructor. */
+    NEEqualizeHistogram();
+    /** Initialise the kernel's inputs.
+     *
+     * @note Currently the width of the input image must be a multiple of 16.
+     *
+     * @param[in]  input  Input image. Data type supported: U8.
+     * @param[out] output Output image. Data type supported: same as @p input
+     */
+    void configure(const IImage *input, IImage *output);
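+
+    /* A minimal usage sketch; src and dst are placeholder U8 images whose width
+     * is a multiple of 16, as the note above requires:
+     *
+     *   NEEqualizeHistogram eq_hist;
+     *   eq_hist.configure(&src, &dst);
+     *   eq_hist.run();
+     */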
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    NEHistogramKernel              _histogram_kernel;    /**< Kernel that calculates the histogram of input. */
+    NECumulativeDistributionKernel _cd_histogram_kernel; /**< Kernel that calculates the cumulative distribution
+                                                              and creates the relevant lookup table. */
+    NETableLookupKernel            _map_histogram_kernel; /**< Kernel that maps the input to output using the LUT. */
+    Distribution1D                 _hist;                 /**< Distribution that holds the histogram of the input image. */
+    Distribution1D                 _cum_dist;             /**< Distribution that holds the cumulative distribution of the input histogram. */
+    Lut                            _cd_lut;               /**< Holds the equalization lookup table. */
+    static constexpr uint32_t      nr_bins{ 256 };        /**< Histogram bins of the internal histograms. */
+    static constexpr uint32_t      max_range{ nr_bins - 1 }; /**< Histogram range of the internal histograms. */
+};
+}
+#endif /*__ARM_COMPUTE_NEEQUALIZEHISTOGRAM_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEErode.h b/arm_compute/runtime/NEON/functions/NEErode.h
new file mode 100644
index 0000000000..940ae18471
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEErode.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEERODE_H__
+#define __ARM_COMPUTE_NEERODE_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute erode. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NEErodeKernel
+ *
+ */
+class NEErode : public INESimpleFunction
+{
+public:
+    /** Initialise the kernel's inputs, output and border mode.
+     *
+     * @param[in, out] input                 First tensor input. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     output                Output tensor. Data type supported: U8.
+     * @param[in]      border_mode           Border mode to use.
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_NEERODE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEFastCorners.h b/arm_compute/runtime/NEON/functions/NEFastCorners.h
new file mode 100644
index 0000000000..d7c31750c5
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEFastCorners.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFASTCORNERS_H__
+#define __ARM_COMPUTE_NEFASTCORNERS_H__
+
+#include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Array.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** Basic function to execute fast corners. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFastCornersKernel
+ * -# @ref NENonMaximaSuppression3x3Kernel (executed if nonmax_suppression == true)
+ * -# @ref NEFillArrayKernel
+ *
+ */
+class NEFastCorners : public IFunction
+{
+public:
+    /** Constructor */
+    NEFastCorners();
+    /** Initialize the function's source, destination and border mode.
+     *
+     * @param[in, out] input                 Source image. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[in]      threshold             Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
+     * @param[in]      nonmax_suppression    If true, non-maximum suppression is applied to detected corners before being placed in the array.
+     * @param[out]     corners               Array of keypoints to store the results.
+     * @param[in]      border_mode           Strategy to use for borders.
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(IImage *input, float threshold, bool nonmax_suppression, KeyPointArray *corners,
+                   BorderMode border_mode, uint8_t constant_border_value = 0);
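+
+    /* A minimal usage sketch; src is a placeholder U8 image and corners a
+     * keypoint array preallocated with an assumed capacity:
+     *
+     *   KeyPointArray corners(10000);
+     *   NEFastCorners fast;
+     *   fast.configure(&src, 20.0f, true, &corners, BorderMode::UNDEFINED);
+     *   fast.run();
+     */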
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    NEFastCornersKernel             _fast_corners_kernel;
+    NEFillBorderKernel              _border_handler;
+    NENonMaximaSuppression3x3Kernel _nonmax_kernel;
+    NEFillArrayKernel               _fill_kernel;
+    Image                           _output;
+    Image                           _suppressed;
+    bool                            _non_max;
+};
+}
+#endif /*__ARM_COMPUTE_NEFASTCORNERS_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEFillBorder.h b/arm_compute/runtime/NEON/functions/NEFillBorder.h
new file mode 100644
index 0000000000..b6b7e77471
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEFillBorder.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFILLBORDER_H__
+#define __ARM_COMPUTE_NEFILLBORDER_H__
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEFillBorderKernel */
+class NEFillBorder : public IFunction
+{
+public:
+    /** Initialize the function's source, destination and border_mode.
+     *
+     * @note This function fills the borders within the XY-planes.
+     *
+     * @param[in, out] input                 Source tensor. Data type supported: U8/QS8/S16/S32/F32
+     * @param[in]      border_width          Width of the tensor border in pixels.
+     * @param[in]      border_mode           Strategy to use for borders.
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    NEFillBorderKernel _border_handler; /**< Kernel to handle image borders */
+};
+}
+#endif /*__ARM_COMPUTE_NEFILLBORDER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
new file mode 100644
index 0000000000..33ec4ef721
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H__
+#define __ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
+#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+/** Basic function to reshape the weights of the Fully Connected layer with NEON. This function calls the following kernels:
+ *
+ * -# @ref NETransposeKernel (if @p transpose_weights is set to true)
+ * -# @ref NEGEMMTranspose1xWKernel (if @p is_batched_fc_layer is set to true)
+ *
+ * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class NEFullyConnectedLayerReshapeWeights : public IFunction
+{
+public:
+    /** Constructor */
+    NEFullyConnectedLayerReshapeWeights();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input               Weights tensor. The weights must be 2 dimensional. Data types supported: QS8/F32.
+     * @param[out] output              Destination tensor. Data type supported: Same as @p input.
+     * @param[in]  transpose_weights   True if the weights must be transposed.
+     * @param[in]  is_batched_fc_layer True if it is a batched fully connected layer.
+     */
+    void configure(const ITensor *input, ITensor *output, bool transpose_weights, bool is_batched_fc_layer);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    NETransposeKernel        _transpose_kernel;
+    NEGEMMTranspose1xWKernel _transpose1xW_kernel;
+    Tensor                   _transpose_output;
+    bool                     _transpose_weights;
+    bool                     _is_batched_fc_layer;
+};
+
+/** Basic function to compute a Fully Connected layer on NEON. This function calls the following NEON kernels:
+ * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer)
+ * -# @ref NEFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped flag is set to false) (called once)
+ * -# @ref NEGEMMInterleave4x4Kernel (called if we have a multi-batch input)
+ * -# @ref NEGEMMMatrixMultiplyKernel
+ * -# @ref NEGEMMMatrixAccumulateBiasesKernel (if @p biases is not equal to nullptr)
+ *
+ * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class NEFullyConnectedLayer : public IFunction
+{
+public:
+    /** Constructor */
+    NEFullyConnectedLayer();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input                Source tensor. Data type supported: QS8/F32.
+     * @param[in]  weights              Weights tensor. The weights must be 2 dimensional. Data type supported: Same as @p input.
+     * @param[in]  biases               Bias tensor. Can be nullptr. Data type supported: Same as @p input.
+     * @param[out] output               Destination tensor. Data type supported: Same as @p input.
+     * @param[in]  transpose_weights    (Optional) Transpose the weights tensor if true. Defaults to true.
+     * @param[in]  are_weights_reshaped (Optional) True if the weights are already reshaped, so the reshape step is skipped. Defaults to false.
+     */
+    void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose_weights = true, bool are_weights_reshaped = false);
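+
+    /* A minimal usage sketch with placeholder tensors; with the default flags
+     * the 2D weights are transposed internally and reshaped on configuration:
+     *
+     *   NEFullyConnectedLayer fc;
+     *   fc.configure(&src, &weights, &biases, &dst);
+     *   fc.run();
+     */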
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    void configure_fc_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output);
+    void configure_fc_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output);
+    void configure_conv_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output);
+    void configure_conv_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output);
+
+    NEIm2ColKernel                      _im2col_kernel;
+    NEFullyConnectedLayerReshapeWeights _reshape_weights_kernel;
+    NEGEMMInterleave4x4Kernel           _interleave4x4_kernel;
+    NEGEMMMatrixMultiplyKernel          _mm_kernel;
+    NEGEMMMatrixAccumulateBiasesKernel  _accumulate_biases_kernel;
+    Tensor                              _im2col_output;
+    Tensor                              _interleave4x4_output;
+    Tensor                              _reshape_weights_output;
+    bool                                _are_weights_reshaped;
+    bool                                _is_fc_after_conv;
+    bool                                _is_batched_fc_layer;
+    bool                                _accumulate_biases;
+};
+}
+#endif /* __ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
new file mode 100644
index 0000000000..a40aa910a5
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMM_H__
+#define __ARM_COMPUTE_NEGEMM_H__
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+/** Basic function to execute GEMM on NEON. This function calls the following NEON kernels:
+ *
+ * -# @ref NEGEMMInterleave4x4Kernel (if the output tensor is a matrix)
+ * -# @ref NEGEMMTranspose1xWKernel (if the output tensor is a matrix)
+ * -# @ref NEGEMMMatrixMultiplyKernel
+ * -# @ref NEGEMMMatrixAdditionKernel (if c != nullptr and beta != 0.0)
+ *
+ */
+class NEGEMM : public IFunction
+{
+public:
+    /** Constructor */
+    NEGEMM();
+    /** Initialise the kernel's inputs and output.
+     *
+     * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
+     * @note GEMM: The tensors a, b, c, d must have the same data type. You should not mix data types when calling this function.
+     *
+     * @param[in]  a     First input tensor (Matrix A or Vector A). Data type supported: QS8/F16/F32
+     * @param[in]  b     Second input tensor (Matrix B). Data type supported: same as @p a
+     * @param[in]  c     Third input tensor (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a
+     * @param[out] d     Output tensor. Data type supported: same as @p a
+     * @param[in]  alpha Weight of the matrix product
+     * @param[in]  beta  Weight of matrix C
+     */
+    void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta);
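+
+    /* A minimal usage sketch computing d = alpha * a * b + beta * c with
+     * placeholder tensors of matching data types:
+     *
+     *   NEGEMM gemm;
+     *   gemm.configure(&a, &b, &c, &d, 1.0f, 1.0f);
+     *   gemm.run();
+     */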
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    NEGEMMInterleave4x4Kernel  _interleave_kernel;
+    NEGEMMTranspose1xWKernel   _transpose_kernel;
+    NEGEMMMatrixMultiplyKernel _mm_kernel;
+    NEGEMMMatrixAdditionKernel _ma_kernel;
+    Tensor                     _tmp_a;
+    Tensor                     _tmp_b;
+    bool                       _run_vector_matrix_multiplication;
+    bool                       _run_addition;
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMM_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h b/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h
new file mode 100644
index 0000000000..b911fd064f
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMINTERLEAVE4X4_H__
+#define __ARM_COMPUTE_NEGEMMINTERLEAVE4X4_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute NEGEMMInterleave4x4Kernel. This function calls the following NEON kernel:
+ *
+ * -# @ref NEGEMMInterleave4x4Kernel
+ *
+ */
+class NEGEMMInterleave4x4 : public INESimpleFunction
+{
+public:
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  input  First input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
+     * @param[out] output Output tensor. Data type supported: same as @p input
+     */
+    void configure(const ITensor *input, ITensor *output);
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMMINTERLEAVE4X4_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowp.h b/arm_compute/runtime/NEON/functions/NEGEMMLowp.h
new file mode 100644
index 0000000000..bfb1a494b8
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowp.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMLOWP_H__
+#define __ARM_COMPUTE_NEGEMMLOWP_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute GEMMLowp on NEON. This function calls the following NEON kernels:
+ *
+ * -# @ref NEGEMMInterleave4x4Kernel
+ * -# @ref NEGEMMTranspose1xWKernel
+ * -# @ref NEGEMMLowpMatrixMultiplyKernel
+ *
+ */
+class NEGEMMLowp : public IFunction
+{
+public:
+    /** Constructor */
+    NEGEMMLowp();
+    /** Initialise the kernel's inputs and output.
+     *
+     * @note GEMM_LOWP: low precision GEMM kernel
+     *  This kernel performs the following computation:
+     *
+     *  -# Convert a values from uint8 to int32 and add a_offset to each of them.
+     *  -# Convert b values from uint8 to int32 and add b_offset to each of them.
+     *  -# Compute the int32 matrix product of the resulting a * b.
+     *  -# Add output_offset to each entry of the result.
+     *  -# Multiply each entry of the result by output_mult_int, shift it right by shift, and round to the nearest integer.
+     *  -# Clamp the resulting int32 values to the [0..255] range and cast to uint8.
+     *
+     * @param[in]  a               First input tensor (Matrix A). Data type supported: U8.
+     * @param[in]  b               Second input tensor (Matrix B). Data type supported: same as @p a
+     * @param[out] output          Output tensor. Data type supported: same as @p a.
+     * @param[in]  a_offset        Offset to be added to each element of the matrix A.
+     * @param[in]  b_offset        Offset to be added to each element of the matrix B.
+     * @param[in]  output_offset   Offset to be added to each element of the output matrix
+     * @param[in]  output_mult_int Value by which to multiply each element of the output matrix
+     * @param[in]  shift           Number of bits to shift right the result.
+     */
+    void configure(const ITensor *a, const ITensor *b, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift);
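+
+    /* The per-entry arithmetic of the steps above, written as pseudocode
+     * (rounded_shift stands for the rounding right shift described in step 5):
+     *
+     *   acc       = sum_k (int32(a[i][k]) + a_offset) * (int32(b[k][j]) + b_offset)
+     *   acc       = (acc + output_offset) * output_mult_int
+     *   out[i][j] = uint8(clamp(rounded_shift(acc, shift), 0, 255))
+     */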
+ */ + void configure(const ITensor *a, const ITensor *b, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift); + // Inherited methods overridden: + void run() override; + +private: + NEGEMMInterleave4x4Kernel _interleave_kernel; + NEGEMMTranspose1xWKernel _transpose_kernel; + NEGEMMLowpMatrixMultiplyKernel _mm_kernel; + Tensor _tmp_a; + Tensor _tmp_b; +}; +} +#endif /*__ARM_COMPUTE_NEGEMMLOWP_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h b/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h new file mode 100644 index 0000000000..447b8c9c70 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMTRANSPOSE1XW_H__ +#define __ARM_COMPUTE_NEGEMMTRANSPOSE1XW_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +namespace arm_compute +{ +/** Basic function to execute NEGEMMTranspose1xWKernel. This function calls the following NEON kernels: + * + * -# @ref NEGEMMTranspose1xWKernel + * + */ +class NEGEMMTranspose1xW : public INESimpleFunction +{ +public: + /** Initialise the kernel's inputs, output + * + * @param[in] input First input tensor. Data type supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32/ + * @param[out] output Output tensor. Data type supported: same as @p input + */ + void configure(const ITensor *input, ITensor *output); +}; +} +#endif /*__ARM_COMPUTE_NEGEMMTRANSPOSE1XW_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEGaussian3x3.h b/arm_compute/runtime/NEON/functions/NEGaussian3x3.h new file mode 100644 index 0000000000..a237e6f0e5 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEGaussian3x3.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGAUSSIAN3x3_H__
+#define __ARM_COMPUTE_NEGAUSSIAN3x3_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute gaussian filter 3x3. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NEGaussian3x3Kernel
+ *
+ */
+class NEGaussian3x3 : public INESimpleFunction
+{
+public:
+    /** Initialise the function's input, output and border mode.
+     *
+     * @param[in, out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     output                Destination tensor. Data type supported: U8.
+     * @param[in]      border_mode           Strategy to use for borders.
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_NEGAUSSIAN3x3_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGaussian5x5.h b/arm_compute/runtime/NEON/functions/NEGaussian5x5.h
new file mode 100644
index 0000000000..699e42efb4
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEGaussian5x5.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGAUSSIAN5x5_H__
+#define __ARM_COMPUTE_NEGAUSSIAN5x5_H__
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute gaussian filter 5x5. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NEGaussian5x5HorKernel
+ * -# @ref NEGaussian5x5VertKernel
+ *
+ */
+class NEGaussian5x5 : public IFunction
+{
+public:
+    /** Default constructor
+     */
+    NEGaussian5x5();
+    /** Initialise the function's input, output and border mode.
+     *
+     * @param[in, out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     output                Destination tensor. Data type supported: U8.
+     * @param[in]      border_mode           Strategy to use for borders.
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+    // Inherited methods overridden:
+    void run() override;
+
+protected:
+    NEGaussian5x5HorKernel  _kernel_hor;     /**< kernel for horizontal pass */
+    NEGaussian5x5VertKernel _kernel_vert;    /**< kernel for vertical pass */
+    Tensor                  _tmp;            /**< temporary buffer for output of horizontal pass */
+    NEFillBorderKernel      _border_handler; /**< kernel to handle tensor borders */
+};
+}
+#endif /*__ARM_COMPUTE_NEGAUSSIAN5x5_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h b/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h
new file mode 100644
index 0000000000..5f0a67ea05
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGAUSSIANPYRAMID_H__
+#define __ARM_COMPUTE_NEGAUSSIANPYRAMID_H__
+
+#include "arm_compute/core/IPyramid.h"
+#include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h"
+#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
+#include "arm_compute/runtime/Pyramid.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Common interface for all Gaussian pyramid functions */
+class NEGaussianPyramid : public IFunction
+{
+public:
+    /** Default constructor */
+    NEGaussianPyramid();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGaussianPyramid(const NEGaussianPyramid &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGaussianPyramid &operator=(const NEGaussianPyramid &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGaussianPyramid(NEGaussianPyramid &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGaussianPyramid &operator=(NEGaussianPyramid &&) = default;
+    /** Default destructor */
+    virtual ~NEGaussianPyramid() = default;
+
+    /** Initialise the function's source, destinations and border mode.
+     *
+     * @param[in]  input                 Source tensor. Data type supported: U8.
+     * @param[out] pyramid               Destination pyramid tensors. Data type supported at each level: U8.
+     * @param[in]  border_mode           Border mode to use.
+     * @param[in]  constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     *
+     */
+    virtual void configure(const ITensor *input, IPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) = 0;
+
+protected:
+    const ITensor *_input;
+    IPyramid      *_pyramid;
+    Pyramid        _tmp;
+};
+
+/** Basic function to execute gaussian pyramid with HALF scale factor. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NEGaussianPyramidHorKernel
+ * -# @ref NEGaussianPyramidVertKernel
+ *
+ */
+class NEGaussianPyramidHalf : public NEGaussianPyramid
+{
+public:
+    /** Constructor */
+    NEGaussianPyramidHalf();
+
+    // Inherited methods overridden:
+    void configure(const ITensor *input, IPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override;
+    void run() override;
+
+private:
+    std::unique_ptr<NEFillBorderKernel[]>          _border_handler;
+    std::unique_ptr<NEGaussianPyramidHorKernel[]>  _horizontal_reduction;
+    std::unique_ptr<NEGaussianPyramidVertKernel[]> _vertical_reduction;
+};
+
+/** Basic function to execute gaussian pyramid with ORB scale factor. This function calls the following NEON kernels and functions:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NEGaussian5x5
+ * -# @ref NEScaleKernel
+ *
+ */
+class NEGaussianPyramidOrb : public NEGaussianPyramid
+{
+public:
+    /** Constructor */
+    NEGaussianPyramidOrb();
+
+    // Inherited methods overridden:
+    void configure(const ITensor *input, IPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override;
+    void run() override;
+
+private:
+    std::unique_ptr<Image[]>         _offsets;
+    std::unique_ptr<NEGaussian5x5[]> _gaus5x5;
+    std::unique_ptr<NEScaleKernel[]> _scale_nearest;
+};
+}
+#endif /*__ARM_COMPUTE_NEGAUSSIANPYRAMID_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEHOGDescriptor.h b/arm_compute/runtime/NEON/functions/NEHOGDescriptor.h
new file mode 100644
index 0000000000..b7b4909060
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEHOGDescriptor.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHOGDESCRIPTOR_H__
+#define __ARM_COMPUTE_NEHOGDESCRIPTOR_H__
+
+#include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/functions/NEHOGGradient.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+class IHOG;
+/** Basic function to calculate HOG descriptor. This function calls the following NEON kernels:
+ *
+ * -# @ref NEHOGGradient
+ * -# @ref NEHOGOrientationBinningKernel
+ * -# @ref NEHOGBlockNormalizationKernel
+ *
+ */
+class NEHOGDescriptor : public IFunction
+{
+public:
+    /** Default constructor */
+    NEHOGDescriptor();
+    /** Initialise the function's source, destination, HOG data-object and border mode
+     *
+     * @param[in, out] input                 Input tensor. Data type supported: U8
+     *                                       (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     output                Output tensor which stores the HOG descriptor. Data type supported: F32. The number of channels is equal to the number of histogram bins per block
+     * @param[in]      hog                   HOG data object which describes the HOG descriptor
+     * @param[in]      border_mode           Border mode to use.
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
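+     *
+     * A minimal usage sketch (illustrative only; the tensor and HOG object setup and
+     * allocation are elided, and the variable names are hypothetical):
+     * @code
+     * NEHOGDescriptor descriptor;
+     * descriptor.configure(&input, &output, &hog, BorderMode::CONSTANT, 0);
+     * descriptor.run();
+     * @endcode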
+     */
+    void configure(ITensor *input, ITensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+    // Inherited method overridden:
+    void run() override;
+
+private:
+    NEHOGGradient                 _gradient;
+    NEHOGOrientationBinningKernel _orient_bin;
+    NEHOGBlockNormalizationKernel _block_norm;
+    Tensor                        _mag;
+    Tensor                        _phase;
+    Tensor                        _hog_space;
+};
+}
+
+#endif /* __ARM_COMPUTE_NEHOGDESCRIPTOR_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEHOGDetector.h b/arm_compute/runtime/NEON/functions/NEHOGDetector.h
new file mode 100644
index 0000000000..98b8a89bc1
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEHOGDetector.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHOGDETECTOR_H__
+#define __ARM_COMPUTE_NEHOGDETECTOR_H__
+
+#include "arm_compute/core/IHOG.h"
+#include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+/** Basic function to execute HOG detector based on linear SVM. This function calls the following NEON kernel:
+ *
+ * -# @ref NEHOGDetectorKernel
+ *
+ */
+class NEHOGDetector : public INESimpleFunction
+{
+public:
+    /** Initialise the kernel's input, output, HOG data object, detection window stride, threshold and index class
+     *
+     * @attention The function does not reset the number of values in @ref IDetectionWindowArray, so it is the caller's responsibility to clear it.
+     *
+     * @param[in]  input                   Input tensor. It is the output of @ref NEHOGDescriptor. Data type supported: F32
+     * @param[in]  hog                     HOG data-object that describes the HOG descriptor
+     * @param[out] detection_windows       Array of @ref DetectionWindow used to store the detected objects
+     * @param[in]  detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
+     *                                     It must be a multiple of the block stride stored in hog
+     * @param[in]  threshold               (Optional) Threshold for the distance between features and SVM classifying plane
+     * @param[in]  idx_class               (Optional) Index of the class used for evaluating which class the detection window belongs to
+     */
+    void configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f, size_t idx_class = 0);
+};
+}
+
+#endif /* __ARM_COMPUTE_NEHOGDETECTOR_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEHOGGradient.h b/arm_compute/runtime/NEON/functions/NEHOGGradient.h
new file mode 100644
index 0000000000..dd2d99adfe
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEHOGGradient.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHOGGRADIENT_H__
+#define __ARM_COMPUTE_NEHOGGRADIENT_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/functions/NEDerivative.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+/** Basic function to calculate the gradient for HOG. This function calls the following NEON kernels:
+ *
+ * -# @ref NEDerivative
+ * -# NEMagnitudePhaseKernel
+ *
+ */
+class NEHOGGradient : public IFunction
+{
+public:
+    /** Default constructor */
+    NEHOGGradient();
+    /** Initialise the function's source, destinations, phase type and border mode
+     *
+     * @param[in, out] input                 Input tensor. Data type supported: U8.
+     *                                       (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     output_magnitude      Output tensor (magnitude). Data type supported: U16.
+     * @param[out]     output_phase          Output tensor (phase). Data type supported: U8
+     * @param[in]      phase_type            Type of @ref PhaseType
+     * @param[in]      border_mode           Border mode to use
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
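+     *
+     * A minimal usage sketch (illustrative only; tensor setup and allocation are elided,
+     * and the variable names are hypothetical):
+     * @code
+     * NEHOGGradient gradient;
+     * gradient.configure(&input, &magnitude, &phase, PhaseType::UNSIGNED, BorderMode::UNDEFINED);
+     * gradient.run();
+     * @endcode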
+     */
+    void configure(ITensor *input, ITensor *output_magnitude, ITensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+    // Inherited method overridden:
+    void run() override;
+
+private:
+    NEDerivative               _derivative;
+    std::unique_ptr<INEKernel> _mag_phase;
+    Tensor                     _gx;
+    Tensor                     _gy;
+};
+}
+#endif /*__ARM_COMPUTE_NEHOGGRADIENT_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h b/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h
new file mode 100644
index 0000000000..2d07e6435f
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHOGMULTIDETECTION_H__
+#define __ARM_COMPUTE_NEHOGMULTIDETECTION_H__
+
+#include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/IMultiHOG.h"
+#include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/functions/NEHOGDetector.h"
+#include "arm_compute/runtime/NEON/functions/NEHOGGradient.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+/** Basic function to detect multiple objects (or the same object at different scales) on the same input image using HOG. This function calls the following NEON kernels:
+ *
+ * -# @ref NEHOGGradient
+ * -# @ref NEHOGOrientationBinningKernel
+ * -# @ref NEHOGBlockNormalizationKernel
+ * -# @ref NEHOGDetector
+ * -# @ref CPPDetectionWindowNonMaximaSuppressionKernel (executed if non_maxima_suppression == true)
+ *
+ * @note This implementation works if all the HOG data-objects within the IMultiHOG container have the same:
+ * -# Phase type
+ * -# Normalization type
+ * -# L2 hysteresis threshold if the normalization type is L2HYS_NORM
+ *
+ */
+class NEHOGMultiDetection : public IFunction
+{
+public:
+    /** Default constructor */
+    NEHOGMultiDetection();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHOGMultiDetection(const NEHOGMultiDetection &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHOGMultiDetection &operator=(const NEHOGMultiDetection &) = delete;
+    /** Initialise the function's source, destination, detection window strides, border mode, threshold and non-maxima suppression
+     *
+     * @param[in, out] input                    Input tensor. Data type supported: U8
+     *                                          (Written to only for @p border_mode != UNDEFINED)
+     * @param[in]      multi_hog                Container of multiple HOG data objects. Each HOG data object describes one HOG model to detect.
+     *                                          This container should store the HOG data-objects in descending or ascending cell_size width order.
+     *                                          This helps to determine whether the HOG descriptor computation can be skipped for some HOG data-objects
+     * @param[out]     detection_windows        Array of @ref DetectionWindow used for locating the detected objects
+     * @param[in]      detection_window_strides Array of @ref Size2D used to specify the distance in pixels between 2 consecutive detection windows in x and y directions for each HOG data-object
+     *                                          The dimension of this array must be the same as multi_hog->num_models()
+     *                                          The i-th detection_window_stride of this array must be a multiple of the block_stride stored in the i-th multi_hog array
+     * @param[in]      border_mode              Border mode to use.
+     * @param[in]      constant_border_value    (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     * @param[in]      threshold                (Optional) Threshold for the distance between features and SVM classifying plane
+     * @param[in]      non_maxima_suppression   (Optional) Flag to specify whether the non-maxima suppression is required or not.
+     *                                          True if the non-maxima suppression stage has to be computed
+     * @param[in]      min_distance             (Optional) Radial Euclidean distance to use for the non-maxima suppression stage
+     *
+     */
+    void configure(ITensor *input, const IMultiHOG *multi_hog, IDetectionWindowArray *detection_windows, const ISize2DArray *detection_window_strides, BorderMode border_mode,
+                   uint8_t constant_border_value = 0,
+                   float threshold = 0.0f, bool non_maxima_suppression = false, float min_distance = 1.0f);
+
+    // Inherited method overridden:
+    void run() override;
+
+private:
+    NEHOGGradient                                                 _gradient_kernel;
+    std::unique_ptr<NEHOGOrientationBinningKernel[]>              _orient_bin_kernel;
+    std::unique_ptr<NEHOGBlockNormalizationKernel[]>              _block_norm_kernel;
+    std::unique_ptr<NEHOGDetector[]>                              _hog_detect_kernel;
+    std::unique_ptr<CPPDetectionWindowNonMaximaSuppressionKernel> _non_maxima_kernel;
+    std::unique_ptr<Tensor[]>                                     _hog_space;
+    std::unique_ptr<Tensor[]>                                     _hog_norm_space;
+    IDetectionWindowArray                                        *_detection_windows;
+    Tensor                                                        _mag;
+    Tensor                                                        _phase;
+    bool                                                          _non_maxima_suppression;
+    size_t                                                        _num_orient_bin_kernel;
+    size_t                                                        _num_block_norm_kernel;
+    size_t                                                        _num_hog_detect_kernel;
+};
+}
+
+#endif /* __ARM_COMPUTE_NEHOGMULTIDETECTION_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEHarrisCorners.h b/arm_compute/runtime/NEON/functions/NEHarrisCorners.h
new file mode 100644
index 0000000000..a709871153
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEHarrisCorners.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHARRISCORNERS_H__
+#define __ARM_COMPUTE_NEHARRISCORNERS_H__
+
+#include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h"
+#include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Array.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** Basic function to execute harris corners detection. This function calls the following NEON kernels and functions:
+ *
+ * -# @ref NESobel3x3 (if gradient_size == 3) or
+ * @ref NESobel5x5 (if gradient_size == 5) or
+ * @ref NESobel7x7 (if gradient_size == 7) + * -# @ref NEFillBorderKernel + * -# NEHarrisScoreKernel<3> (if block_size == 3) or
+ * NEHarrisScoreKernel<5> (if block_size == 5) or
+ *    NEHarrisScoreKernel<7> (if block_size == 7)
+ * -# @ref NENonMaximaSuppression3x3
+ * -# @ref CPPCornerCandidatesKernel
+ * -# @ref CPPSortEuclideanDistanceKernel
+ *
+ */
+class NEHarrisCorners : public IFunction
+{
+public:
+    /** Constructor
+     *
+     * Initialize _sobel, _harris_score and _corners_list to nullptr.
+     */
+    NEHarrisCorners();
+    /** Initialize the function's source, destination, parameters and border mode.
+     *
+     * @param[in, out] input                 Source image. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[in]      threshold             Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
+     * @param[in]      min_dist              Radial Euclidean distance for the euclidean distance stage
+     * @param[in]      sensitivity           Sensitivity threshold k from the Harris-Stephens equation
+     * @param[in]      gradient_size         The gradient window size to use on the input. The implementation supports 3, 5, and 7
+     * @param[in]      block_size            The block window size used to compute the Harris Corner score. The implementation supports 3, 5, and 7.
+     * @param[out]     corners               Array of keypoints to store the results.
+     * @param[in]      border_mode           Border mode to use
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     * @param[in]      use_fp16              (Optional) If true the FP16 kernels will be used. If false F32 kernels are used.
+     */
+    void configure(IImage *input, float threshold, float min_dist, float sensitivity,
+                   int32_t gradient_size, int32_t block_size, KeyPointArray *corners,
+                   BorderMode border_mode, uint8_t constant_border_value = 0, bool use_fp16 = false);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    std::unique_ptr<IFunction>            _sobel;                 /**< Sobel function */
+    std::unique_ptr<INEHarrisScoreKernel> _harris_score;          /**< Harris score kernel */
+    NENonMaximaSuppression3x3             _non_max_suppr;         /**< Non-maxima suppression function */
+    CPPCornerCandidatesKernel             _candidates;            /**< Sort kernel */
+    CPPSortEuclideanDistanceKernel        _sort_euclidean;        /**< Euclidean distance kernel */
+    NEFillBorderKernel                    _border_gx;             /**< Border handler before running harris score */
+    NEFillBorderKernel                    _border_gy;             /**< Border handler before running harris score */
+    Image                                 _gx;                    /**< Source image - Gx component */
+    Image                                 _gy;                    /**< Source image - Gy component */
+    Image                                 _score;                 /**< Source image - Harris score */
+    Image                                 _nonmax;                /**< Source image - Non-Maxima suppressed image */
+    std::unique_ptr<InternalKeypoint[]>   _corners_list;          /**< Array of InternalKeypoint. It stores the potential corner candidates */
+    int32_t                               _num_corner_candidates; /**< Number of potential corner candidates */
+};
+}
+#endif /*__ARM_COMPUTE_NEHARRISCORNERS_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEHistogram.h b/arm_compute/runtime/NEON/functions/NEHistogram.h
new file mode 100644
index 0000000000..c24510dcb3
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEHistogram.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHISTOGRAM_H__
+#define __ARM_COMPUTE_NEHISTOGRAM_H__
+
+#include "arm_compute/core/NEON/kernels/NEHistogramKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+class IDistribution1D;
+
+/** Basic function to run @ref NEHistogramKernel. */
+class NEHistogram : public IFunction
+{
+public:
+    /** Default Constructor. */
+    NEHistogram();
+    /** Initialise the kernel's inputs.
+     *
+     * @param[in]  input  Input image. Data type supported: U8.
+     * @param[out] output Output distribution.
+     */
+    void configure(const IImage *input, IDistribution1D *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    NEHistogramKernel           _histogram_kernel;
+    std::unique_ptr<uint32_t[]> _local_hist;
+    std::unique_ptr<uint32_t[]> _window_lut;
+    size_t                      _local_hist_size;
+    /** 256 possible pixel values as we handle only U8 images */
+    static constexpr unsigned int window_lut_default_size = 256;
+};
+}
+#endif /*__ARM_COMPUTE_NEHISTOGRAM_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEIntegralImage.h b/arm_compute/runtime/NEON/functions/NEIntegralImage.h
new file mode 100644
index 0000000000..6d7dd697e8
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEIntegralImage.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEINTEGRALIMAGE_H__
+#define __ARM_COMPUTE_NEINTEGRALIMAGE_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run a @ref NEIntegralImageKernel */
+class NEIntegralImage : public INESimpleFunction
+{
+public:
+    /** Initialise the function's source and destination.
+     *
+     * @param[in]  input  Source tensor. Data type supported: U8.
+     * @param[out] output Destination tensor. Data type supported: U32.
+     */
+    void configure(const ITensor *input, ITensor *output);
+};
+}
+#endif /*__ARM_COMPUTE_NEINTEGRALIMAGE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NELaplacianPyramid.h b/arm_compute/runtime/NEON/functions/NELaplacianPyramid.h
new file mode 100644
index 0000000000..991ae7c293
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NELaplacianPyramid.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NELAPLACIANPYRAMID_H__
+#define __ARM_COMPUTE_NELAPLACIANPYRAMID_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
+#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
+#include "arm_compute/runtime/Pyramid.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute laplacian pyramid. This function calls the following NEON kernels and functions:
+ *
+ * -# @ref NEGaussianPyramidHalf
+ * -# @ref NEGaussian5x5
+ * -# @ref NEArithmeticSubtraction
+ *
+ * First a Gaussian pyramid is created. Then, for each level i, the corresponding tensor I(i) is blurred with the Gaussian 5x5 filter, and the
+ * difference between the two tensors is the corresponding level L(i) of the Laplacian pyramid:
+ *       L(i) = I(i) - Gaussian5x5(I(i))
+ * Level 0 always has the same first two dimensions as the input tensor.
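+ *
+ * A minimal usage sketch (illustrative only; pyramid and tensor setup and allocation are
+ * elided, and the variable names are hypothetical):
+ * @code
+ * NELaplacianPyramid lap_pyr;
+ * lap_pyr.configure(&src, &pyramid, &lowest_res, BorderMode::REPLICATE, 0);
+ * lap_pyr.run();
+ * @endcode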
+*/
+class NELaplacianPyramid : public IFunction
+{
+public:
+    /** Constructor */
+    NELaplacianPyramid();
+    /** Initialise the function's source, destinations and border mode.
+     *
+     * @param[in]  input                 Source tensor. Data type supported: U8.
+     * @param[out] pyramid               Destination pyramid tensors. Data type supported at each level: S16.
+     * @param[out] output                The lowest resolution tensor necessary to reconstruct the input tensor from the pyramid. Data type supported: S16.
+     *                                   The first two dimensions of this tensor must match the first two dimensions of the tensor in the last level of the pyramid, that is:
+     *                                   out.width = in.width() / pow(2,pyramid_levels-1) and out.height = in.height() / pow(2,pyramid_levels-1)
+     * @param[in]  border_mode           Border mode to use.
+     * @param[in]  constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     *
+     */
+    void configure(const ITensor *input, IPyramid *pyramid, ITensor *output, BorderMode border_mode, uint8_t constant_border_value);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    size_t                                     _num_levels;
+    NEGaussianPyramidHalf                      _gaussian_pyr_function;
+    std::unique_ptr<NEGaussian5x5[]>           _convf;
+    std::unique_ptr<NEArithmeticSubtraction[]> _subf;
+    Pyramid                                    _gauss_pyr;
+    Pyramid                                    _conv_pyr;
+    NEDepthConvert                             _depth_function;
+};
+}
+#endif /*__ARM_COMPUTE_NELAPLACIANPYRAMID_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h b/arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h
new file mode 100644
index 0000000000..4139733499
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NELAPLACIANRECONSTRUCT_H__
+#define __ARM_COMPUTE_NELAPLACIANRECONSTRUCT_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/NEON/functions/NEScale.h"
+#include "arm_compute/runtime/Pyramid.h"
+
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** Basic function to execute laplacian reconstruction. This function calls the following NEON kernels and functions:
+ *
+ * -# @ref NEArithmeticAddition
+ * -# @ref NEScale
+ * -# @ref NEDepthConvert
+ *
+ * This function reconstructs the original image from a Laplacian Image Pyramid.
+ *
+ * The input image is added to the last level of the Laplacian pyramid L(n-1), and the resulting image is upsampled to the
+ * resolution of the next pyramid level.
+ *
+ * I(n-2) = upsample(input + L(n-1))
+ *
+ * For each pyramid level i, except i=0 and i=n-1:
+ *       I(i-1) = upsample(I(i) + L(i))
+ *
+ * output = I(0) + L(0)
+*/
+class NELaplacianReconstruct : public IFunction
+{
+public:
+    /** Constructor */
+    NELaplacianReconstruct();
+    /** Initialise the function's source, destinations and border mode.
+     *
+     * The Output image must have the same size as the first level of the pyramid.
+     * The Input image must have the same size as the last level of the pyramid.
+     *
+     * The idea is to reconstruct the original hi-res image from a low-res representation of it and the laplacian pyramid.
+     *
+     * @param[in]  pyramid               Laplacian pyramid tensors. Data type supported at each level: S16.
+     * @param[in]  input                 Source tensor. Data type supported: S16.
+     * @param[out] output                Output tensor. Data type supported: U8.
+     * @param[in]  border_mode           Border mode to use for the convolution.
+     * @param[in]  constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     *
+     */
+    void configure(const IPyramid *pyramid, const ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    Pyramid                                 _tmp_pyr;
+    std::unique_ptr<NEArithmeticAddition[]> _addf;
+    std::unique_ptr<NEScale[]>              _scalef;
+    NEDepthConvert                          _depthf;
+};
+}
+#endif /*__ARM_COMPUTE_NELAPLACIANRECONSTRUCT_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
new file mode 100644
index 0000000000..1b2b2ee3cf
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NELOCALLYCONNECTEDLAYER_H__
+#define __ARM_COMPUTE_NELOCALLYCONNECTEDLAYER_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
+#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
+#include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+class INETensor;
+
+/** Basic function to compute the locally connected layer. This function calls the following NEON kernels:
+ *
+ * -# @ref NEWeightsReshapeKernel (executed only once for each configuration)
+ * -# @ref NEIm2ColKernel
+ * -# @ref NELocallyConnectedMatrixMultiplyKernel
+ * -# @ref NECol2ImKernel
+ */
+class NELocallyConnectedLayer : public IFunction
+{
+public:
+    /** Default constructor */
+    NELocallyConnectedLayer();
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input     Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+     *                       while every optional dimension from 4 and above represent a batch of inputs.
+     *                       Data types supported: F32.
+     * @param[in]  weights   Weights tensor. Weights are 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches]. Data type supported: Same as @p input.
+     * @param[in]  biases    Biases tensor. Shared biases supported. Biases are 2D tensor with dimensions [OFM, num_patches]. Data type supported: Same as @p input.
+     * @param[out] output    Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+     *                       Data types supported: Same as @p input.
+     * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
+     */
+    void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    NEIm2ColKernel                         _input_im2col_kernel;
+    NEWeightsReshapeKernel                 _weights_reshape_kernel;
+    NELocallyConnectedMatrixMultiplyKernel _mm_kernel;
+    NECol2ImKernel                         _output_col2im_kernel;
+    Tensor                                 _input_im2col_reshaped;
+    Tensor                                 _weights_reshaped;
+    Tensor                                 _gemm_output;
+    bool                                   _is_first_run;
+};
+}
+#endif /* __ARM_COMPUTE_NELOCALLYCONNECTEDLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEMagnitude.h b/arm_compute/runtime/NEON/functions/NEMagnitude.h
new file mode 100644
index 0000000000..6c1f988ef0
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEMagnitude.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMAGNITUDE_H__
+#define __ARM_COMPUTE_NEMAGNITUDE_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run NEMagnitudePhaseKernel */
+class NEMagnitude : public INESimpleFunction
+{
+public:
+    /** Initialise the kernel's inputs.
+     *
+     * @param[in]  input1   First tensor input. Data type supported: S16.
+     * @param[in]  input2   Second tensor input. Data type supported: S16.
+     * @param[out] output   Output tensor. Data type supported: S16.
+     * @param[in]  use_fp16 (Optional) If true the FP16 kernels will be used. If false F32 kernels are used.
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, bool use_fp16 = false);
+};
+}
+#endif /*__ARM_COMPUTE_NEMAGNITUDE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEMeanStdDev.h b/arm_compute/runtime/NEON/functions/NEMeanStdDev.h
new file mode 100644
index 0000000000..3770b2a270
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEMeanStdDev.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMEANSTDDEV_H__
+#define __ARM_COMPUTE_NEMEANSTDDEV_H__
+
+#include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+/** Basic function to execute mean and std deviation. This function calls the following NEON kernels:
+ *
+ * -# @ref NEMeanStdDevKernel
+ *
+ */
+class NEMeanStdDev : public IFunction
+{
+public:
+    /** Default Constructor. */
+    NEMeanStdDev();
+    /** Initialise the kernel's inputs and outputs.
+     *
+     * @param[in]  input  Input image. Data type supported: U8.
+     * @param[out] mean   Output average pixel value.
+     * @param[out] stddev (Optional) Output standard deviation of pixel values.
+     */
+    void configure(const IImage *input, float *mean, float *stddev = nullptr);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    NEMeanStdDevKernel _mean_stddev_kernel; /**< Kernel that performs the mean and standard deviation calculation. */
+    uint64_t           _global_sum;         /**< Variable that holds the global sum among calls in order to ease reduction */
+    uint64_t           _global_sum_squared; /**< Variable that holds the global sum of squared values among calls in order to ease reduction */
+};
+}
+#endif /*__ARM_COMPUTE_NEMEANSTDDEV_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEMedian3x3.h b/arm_compute/runtime/NEON/functions/NEMedian3x3.h
new file mode 100644
index 0000000000..a3df687a35
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEMedian3x3.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMEDIAN3x3_H__
+#define __ARM_COMPUTE_NEMEDIAN3x3_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute median filter. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NEMedian3x3Kernel
+ *
+ */
+class NEMedian3x3 : public INESimpleFunction
+{
+public:
+    /** Initialise the function's source, destinations and border mode.
+     *
+     * @param[in, out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     output                Destination tensor. Data type supported: U8.
+     * @param[in]      border_mode           Border mode to use for the convolution.
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     *
+     */
+    void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_NEMEDIAN3x3_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h b/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h
new file mode 100644
index 0000000000..82e75ee48b
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMINMAXLOCATION_H__
+#define __ARM_COMPUTE_NEMINMAXLOCATION_H__
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h"
+#include "arm_compute/runtime/Array.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** Basic function to execute min and max location. This function calls the following NEON kernels:
+ *
+ * -# NEMinMaxKernel
+ * -# NEMinMaxLocationKernel
+ */
+class NEMinMaxLocation : public IFunction
+{
+public:
+    /** Constructor */
+    NEMinMaxLocation();
+    /** Initialise the kernel's inputs and outputs.
+     *
+     * @param[in]  input     Input image. Data types supported: U8/S16.
+     * @param[out] min       Minimum value of image.
+     * @param[out] max       Maximum value of image.
+     * @param[out] min_loc   (Optional) Array of minimum value locations.
+     * @param[out] max_loc   (Optional) Array of maximum value locations.
+     * @param[out] min_count (Optional) Number of minimum value encounters.
+     * @param[out] max_count (Optional) Number of maximum value encounters.
+     */
+    void configure(const IImage *input, int32_t *min, int32_t *max,
+                   ICoordinates2DArray *min_loc = nullptr, ICoordinates2DArray *max_loc = nullptr,
+                   uint32_t *min_count = nullptr, uint32_t *max_count = nullptr);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    NEMinMaxKernel         _min_max;     /**< Kernel that performs min/max */
+    NEMinMaxLocationKernel _min_max_loc; /**< Kernel that extracts min/max locations */
+};
+}
+#endif /*__ARM_COMPUTE_NEMINMAXLOCATION_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NENonLinearFilter.h b/arm_compute/runtime/NEON/functions/NENonLinearFilter.h
new file mode 100644
index 0000000000..d8a9eaebfb
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NENonLinearFilter.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NENONLINEARFILTER_H__
+#define __ARM_COMPUTE_NENONLINEARFILTER_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute non-linear filter. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NENonLinearFilterKernel
+ *
+ * @note Supported mask dimensions: squares of sizes 3 and 5
+ */
+class NENonLinearFilter : public INESimpleFunction
+{
+public:
+    /** Initialize the function's source, destination, filter parameters and border mode.
+     *
+     * @param[in, out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     output                Destination tensor. Data type supported: U8.
+     * @param[in]      function              Non-linear function to perform.
+     * @param[in]      mask_size             Mask size. Supported sizes: 3, 5.
+     * @param[in]      pattern               Mask pattern.
+     * @param[in]      mask                  The given mask. Used only if @p pattern is set to PATTERN_OTHER.
+     * @param[in]      border_mode           Strategy to use for borders.
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, BorderMode border_mode,
+                   uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_NENONLINEARFILTER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h b/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h
new file mode 100644
index 0000000000..c87d722878
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NENONMAXIMASUPPRESSION3X3_H__
+#define __ARM_COMPUTE_NENONMAXIMASUPPRESSION3X3_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute non-maxima suppression over a 3x3 window. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NENonMaximaSuppression3x3Kernel
+ *
+ */
+class NENonMaximaSuppression3x3 : public INESimpleFunction
+{
+public:
+    /** Initialise the function's source, destination and border mode.
+     *
+     * @note The implementation supports just 2 border modes: UNDEFINED and CONSTANT.
+     *       The constant value used with CONSTANT border mode is 0.
+     *
+     * @param[in, out] input       Source tensor. Data type supported: U8/F32. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     output      Destination tensor for the non-maxima suppression 3x3. Data type supported: same as @p input.
+     * @param[in]      border_mode Border mode to use for non-maxima suppression. The implementation supports just 2 border modes: UNDEFINED and CONSTANT.
+     *
+     */
+    void configure(ITensor *input, ITensor *output, BorderMode border_mode);
+};
+}
+#endif /* __ARM_COMPUTE_NENONMAXIMASUPPRESSION3X3_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
new file mode 100644
index 0000000000..3202867c43
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NENORMALIZATIONLAYER_H__ +#define __ARM_COMPUTE_NENORMALIZATIONLAYER_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h" +#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h" +#include "arm_compute/runtime/Tensor.h" + +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to simulate a normalization layer. This function calls the following NEON kernels: + * + * -# @ref NEPixelWiseMultiplicationKernel + * -# @ref NEFillBorderKernel + * -# @ref NENormalizationLayerKernel + * + */ +class NENormalizationLayer : public IFunction +{ +public: + /** Default constructor */ + NENormalizationLayer(); + /** Set the input and output tensors. + * + * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], + * and an optional 4th dimension for batch of inputs. Data type supported: QS8/F32 + * @param[out] output Destination with the same dimensions, data type and number of channels of @p input + * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters. + */ + void configure(const ITensor *input, ITensor *output, NormalizationLayerInfo norm_info); + + // Inherited methods overridden: + void run() override; + +private: + NENormalizationLayerKernel _norm_kernel; /**< Normalization layer kernel */ + NEPixelWiseMultiplicationKernel _multiply_kernel; /**< Pixel multiplication kernel */ + NEFillBorderKernel _border_handler; /**< Kernel to handle borders */ + Tensor _input_squared; /**< The intermediate buffer which stores results of squaring input */ +}; +} +#endif /* __ARM_COMPUTE_NENORMALIZATIONLAYER_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEOpticalFlow.h b/arm_compute/runtime/NEON/functions/NEOpticalFlow.h new file mode 100644 index 0000000000..0534551d19 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEOpticalFlow.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEOPTICALFLOW_H__
+#define __ARM_COMPUTE_NEOPTICALFLOW_H__
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Array.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/functions/NEScharr3x3.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+class Pyramid;
+
+using LKInternalKeypointArray = Array<NELKInternalKeypoint>;
+/** Basic function to execute optical flow. This function calls the following NEON kernels and functions:
+ *
+ * -# @ref NEScharr3x3
+ * -# @ref NELKTrackerKernel
+ *
+ */
+class NEOpticalFlow : public IFunction
+{
+public:
+    /** Constructor */
+    NEOpticalFlow();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEOpticalFlow(const NEOpticalFlow &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEOpticalFlow &operator=(const NEOpticalFlow &) = delete;
+    /** Initialise the function input and output
+     *
+     * @param[in]  old_pyramid          Pointer to the pyramid for the old tensor. Data type supported: U8
+     * @param[in]  new_pyramid          Pointer to the pyramid for the new tensor. Data type supported: U8
+     * @param[in]  old_points           Pointer to the IKeyPointArray storing old key points
+     * @param[in]  new_points_estimates Pointer to the IKeyPointArray storing the estimated new key points
+     * @param[out] new_points           Pointer to the IKeyPointArray storing new key points
+     * @param[in]  termination          The criteria to terminate the search of each keypoint.
+     * @param[in]  epsilon              The error for terminating the algorithm
+     * @param[in]  num_iterations       The maximum number of iterations before terminating the algorithm
+     * @param[in]  window_dimension     The size of the window on which to perform the algorithm
+     * @param[in]  use_initial_estimate The flag to indicate whether the initial estimated position should be used
+     * @param[in]  border_mode          The border mode applied at the Scharr kernel stage
+     * @param[in]  constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT
+     *
+     */
+    void configure(const Pyramid *old_pyramid, const Pyramid *new_pyramid, const IKeyPointArray *old_points, const IKeyPointArray *new_points_estimates,
+                   IKeyPointArray *new_points, Termination termination, float epsilon, unsigned int num_iterations, size_t window_dimension,
+                   bool use_initial_estimate, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    std::unique_ptr<NEScharr3x3[]>       _func_scharr;
+    std::unique_ptr<NELKTrackerKernel[]> _kernel_tracker;
+    std::unique_ptr<Tensor[]>            _scharr_gx;
+    std::unique_ptr<Tensor[]>            _scharr_gy;
+    IKeyPointArray         *_new_points;
+    const IKeyPointArray   *_new_points_estimates;
+    const IKeyPointArray   *_old_points;
+    LKInternalKeypointArray _new_points_internal;
+    LKInternalKeypointArray _old_points_internal;
+    unsigned int            _num_levels;
+};
+}
+#endif /*__ARM_COMPUTE_NEOPTICALFLOW_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEPhase.h b/arm_compute/runtime/NEON/functions/NEPhase.h
new file mode 100644
index 0000000000..985ba84c4c
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEPhase.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEPHASE_H__
+#define __ARM_COMPUTE_NEPHASE_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEMagnitudePhaseKernel */
+class NEPhase : public INESimpleFunction
+{
+public:
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  input1 First tensor input. Data type supported: S16.
+     * @param[in]  input2 Second tensor input. Data type supported: S16.
+     * @param[out] output Output tensor. Data type supported: U8.
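+     *
+     * A minimal usage sketch (illustrative only; assumes gx and gy already contain S16 gradients
+     * and that all tensors have been allocated via Tensor::allocator()):
+     * @code
+     * Tensor gx, gy, phase; // gx/gy: S16 inputs, phase: U8 output
+     * NEPhase phase_func;
+     * phase_func.configure(&gx, &gy, &phase);
+     * phase_func.run(); // run() is inherited from INESimpleFunction
+     * @endcode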
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+};
+}
+#endif /*__ARM_COMPUTE_NEPHASE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
new file mode 100644
index 0000000000..de7a797cd8
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H__
+#define __ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEPixelWiseMultiplicationKernel */
+class NEPixelWiseMultiplication : public INESimpleFunction
+{
+public:
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * @param[in]  input1          First tensor input. Data types supported: U8/QS8/S16/F32.
+     * @param[in]  input2          Second tensor input. Data types supported: U8/QS8/S16/F32.
+     * @param[out] output          Output tensor. Data types supported: U8/QS8/S16/F32.
+     * @param[in]  scale           Scale to apply after multiplication. Must be positive.
+     * @param[in]  overflow_policy Overflow policy.
+     * @param[in]  rounding_policy Rounding policy.
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+};
+}
+#endif /*__ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
new file mode 100644
index 0000000000..5a9cffa5ae
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEPOOLINGLAYER_H__ +#define __ARM_COMPUTE_NEPOOLINGLAYER_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following NEON kernels: + * + * -# @ref NEFillBorderKernel (executed if padding size is different from zero) + * -# @ref NEPoolingLayerKernel + */ +class NEPoolingLayer : public INESimpleFunction +{ +public: + /** Set the input and output tensors. + * + * @param[in, out] input Source tensor. (Written to only when padding != 0) Data types supported: QS8/F32. + * @param[out] output Destination tensor. Data types supported: Same as @p input. + * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. + */ + void configure(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info); +}; +} +#endif /* __ARM_COMPUTE_NEPOOLINGLAYER_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NERemap.h b/arm_compute/runtime/NEON/functions/NERemap.h new file mode 100644 index 0000000000..b1ec559817 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NERemap.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEREMAP_H__
+#define __ARM_COMPUTE_NEREMAP_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute remap. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NERemapKernel
+ */
+class NERemap : public INESimpleFunction
+{
+public:
+    /** Initialise the function's sources, destination, interpolation policy and border mode.
+     *
+     * @param[in, out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[in]      map_x                 Map for X coordinates. Data type supported: F32.
+     * @param[in]      map_y                 Map for Y coordinates. Data type supported: F32.
+     * @param[out]     output                Output tensor. Data type supported: U8.
+     * @param[in]      policy                Interpolation policy to use. Only NEAREST and BILINEAR are supported.
+     * @param[in]      border_mode           Border mode to use on the input tensor.
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     *
+     */
+    void configure(ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output,
+                   InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_NEREMAP_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEScale.h b/arm_compute/runtime/NEON/functions/NEScale.h
new file mode 100644
index 0000000000..e1da891dcf
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEScale.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESCALEIMAGE_H__
+#define __ARM_COMPUTE_NESCALEIMAGE_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEScaleKernel */
+class NEScale : public INESimpleFunction
+{
+public:
+    /** Constructor
+     *
+     * Initialize NEScale
+     */
+    NEScale();
+    /** Initialize the function's source, destination, interpolation type and border mode.
+     *
+     * @param[in, out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     output                Destination tensor. Data type supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in]      policy                The interpolation type.
+     * @param[in]      border_mode           Strategy to use for borders.
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+private:
+    Tensor _offsets; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */
+    Tensor _dx;      /**< Element's distance between the real X coordinate and the nearest smaller integer X (used for BILINEAR interpolation) */
+    Tensor _dy;      /**< Element's distance between the real Y coordinate and the nearest smaller integer Y (used for BILINEAR interpolation) */
+};
+}
+#endif /*__ARM_COMPUTE_NESCALEIMAGE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEScharr3x3.h b/arm_compute/runtime/NEON/functions/NEScharr3x3.h
new file mode 100644
index 0000000000..db24723902
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEScharr3x3.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESCHARR3x3_H__
+#define __ARM_COMPUTE_NESCHARR3x3_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute Scharr 3x3 filter. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NEScharr3x3Kernel
+ *
+ */
+class NEScharr3x3 : public INESimpleFunction
+{
+public:
+    /** Initialise the function's source, destinations and border mode.
+     *
+     * @note At least one of output_x or output_y must be non-NULL.
+     *
+     * @param[in, out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     output_x              (optional) Destination for the Scharr 3x3 convolution along the X axis. Data type supported: S16.
+     * @param[out]     output_y              (optional) Destination for the Scharr 3x3 convolution along the Y axis. Data type supported: S16.
+     * @param[in]      border_mode           Border mode to use for the convolution.
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     *
+     */
+    void configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_NESCHARR3x3_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NESobel3x3.h b/arm_compute/runtime/NEON/functions/NESobel3x3.h
new file mode 100644
index 0000000000..e2896ba058
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NESobel3x3.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESOBEL3x3_H__
+#define __ARM_COMPUTE_NESOBEL3x3_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute Sobel 3x3 filter. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NESobel3x3Kernel
+ *
+ */
+class NESobel3x3 : public INESimpleFunction
+{
+public:
+    /** Initialise the function's source, destinations and border mode.
+     *
+     * @note At least one of output_x or output_y must be non-NULL.
+     *
+     * @param[in, out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     output_x              (optional) Destination for the Sobel 3x3 convolution along the X axis. Data type supported: S16.
+     * @param[out]     output_y              (optional) Destination for the Sobel 3x3 convolution along the Y axis. Data type supported: S16.
+     * @param[in]      border_mode           Border mode to use for the convolution.
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     *
+     */
+    void configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_NESOBEL3x3_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NESobel5x5.h b/arm_compute/runtime/NEON/functions/NESobel5x5.h
new file mode 100644
index 0000000000..fc4d665a70
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NESobel5x5.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESOBEL5x5_H__
+#define __ARM_COMPUTE_NESOBEL5x5_H__
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NESobel5x5Kernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute Sobel 5x5 filter. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NESobel5x5HorKernel
+ * -# @ref NESobel5x5VertKernel
+ *
+ */
+class NESobel5x5 : public IFunction
+{
+public:
+    /** Default constructor */
+    NESobel5x5();
+    /** Initialise the function's source, destinations and border mode.
+     *
+     * @note At least one of output_x or output_y must be non-NULL.
+     *
+     * @param[in, out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     output_x              (optional) Destination for the Sobel 5x5 convolution along the X axis. Data type supported: S16.
+     * @param[out]     output_y              (optional) Destination for the Sobel 5x5 convolution along the Y axis. Data type supported: S16.
+     * @param[in]      border_mode           Border mode to use for the convolution.
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
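+     *
+     * A minimal usage sketch (illustrative only; src is assumed to be an allocated U8 tensor):
+     * @code
+     * Tensor src, sobel_x, sobel_y; // src: U8, sobel_x/sobel_y: S16
+     * NESobel5x5 sobel;
+     * sobel.configure(&src, &sobel_x, &sobel_y, BorderMode::UNDEFINED);
+     * sobel.run(); // horizontal pass, then vertical pass (borders filled only for CONSTANT/REPLICATE)
+     * @endcode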
+     *
+     */
+    void configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+    // Inherited methods overridden:
+    void run() override;
+
+protected:
+    NESobel5x5HorKernel  _sobel_hor;      /**< Sobel Horizontal 5x5 kernel */
+    NESobel5x5VertKernel _sobel_vert;     /**< Sobel Vertical 5x5 kernel */
+    Tensor               _tmp_x;          /**< Temporary buffer for Sobel X */
+    Tensor               _tmp_y;          /**< Temporary buffer for Sobel Y */
+    NEFillBorderKernel   _border_handler; /**< Kernel to handle tensor borders */
+};
+}
+#endif /*__ARM_COMPUTE_NESOBEL5x5_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NESobel7x7.h b/arm_compute/runtime/NEON/functions/NESobel7x7.h
new file mode 100644
index 0000000000..06b7c80ad6
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NESobel7x7.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESOBEL7x7_H__
+#define __ARM_COMPUTE_NESOBEL7x7_H__
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NESobel7x7Kernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute Sobel 7x7 filter. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NESobel7x7HorKernel
+ * -# @ref NESobel7x7VertKernel
+ *
+ */
+class NESobel7x7 : public IFunction
+{
+public:
+    /** Default constructor */
+    NESobel7x7();
+    /** Initialise the function's source, destinations and border mode.
+     *
+     * @note At least one of output_x or output_y must be non-NULL.
+     *
+     * @param[in, out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     output_x              (optional) Destination for the Sobel 7x7 convolution along the X axis. Data type supported: S32.
+     * @param[out]     output_y              (optional) Destination for the Sobel 7x7 convolution along the Y axis. Data type supported: S32.
+     * @param[in]      border_mode           Border mode to use for the convolution.
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
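+     *
+     * A sketch of the single-axis case (illustrative only; per the note above, the unused output
+     * may be nullptr; src is assumed to be an allocated U8 tensor):
+     * @code
+     * Tensor src, sobel_x; // sobel_x: S32 for the 7x7 filter
+     * NESobel7x7 sobel;
+     * sobel.configure(&src, &sobel_x, nullptr, BorderMode::REPLICATE);
+     * sobel.run();
+     * @endcode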
+ * + */ + void configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0); + + // Inherited methods overridden: + void run() override; + +protected: + NESobel7x7HorKernel _sobel_hor; /**< Sobel Horizontal 7x7 kernel */ + NESobel7x7VertKernel _sobel_vert; /**< Sobel Vertical 7x7 kernel */ + Tensor _tmp_x; /**< Temporary buffer for Sobel X */ + Tensor _tmp_y; /**< Temporary buffer for Sobel Y */ + NEFillBorderKernel _border_handler; /**< Kernel to handle tensor borders */ +}; +} +#endif /*__ARM_COMPUTE_NESOBEL7x7_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h new file mode 100644 index 0000000000..dc84dec0e4 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NESOFTMAXLAYER_H__ +#define __ARM_COMPUTE_NESOFTMAXLAYER_H__ + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/Tensor.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to compute a SoftmaxLayer. + * + * Softmax is calculated by : + * @f[ out = \frac{e^{x - max(x)}}{\sum{e^{x - max(x)}}} @f] + * + * This function runs the following kernels: + * -# @ref NELogits1DMaxKernel + * -# @ref NELogits1DShiftExpSumKernel + * -# @ref NELogits1DNormKernel + */ +class NESoftmaxLayer : public IFunction +{ +public: + /** Constructor */ + NESoftmaxLayer(); + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data types supported: QS8/F32. + * @param[out] output Destination tensor. Data types supported: same as @p input. 
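+     *
+     * A minimal usage sketch (illustrative only; in and out are assumed to be allocated F32
+     * tensors of matching shape):
+     * @code
+     * Tensor in, out;
+     * NESoftmaxLayer softmax;
+     * softmax.configure(&in, &out);
+     * softmax.run(); // max, shift/exp/sum and normalisation kernels run in sequence
+     * @endcode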
+     */
+    void configure(ITensor *input, ITensor *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    NELogits1DMaxKernel         _max_kernel;           /**< Kernel to compute the per-row maximum */
+    NELogits1DShiftExpSumKernel _shift_exp_sum_kernel; /**< Kernel to compute the shifted exponentials and their sum */
+    NELogits1DNormKernel        _norm_kernel;          /**< Kernel to normalise by the sum of exponentials */
+    NEFillBorderKernel          _fill_border_kernel;   /**< Kernel to handle tensor borders */
+    Tensor                      _max;                  /**< Intermediate buffer holding the row maxima */
+    Tensor                      _sum;                  /**< Intermediate buffer holding the sums of exponentials */
+    Tensor                      _tmp;                  /**< Intermediate buffer holding the shifted exponentials */
+};
+}
+#endif /* __ARM_COMPUTE_NESOFTMAXLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NETableLookup.h b/arm_compute/runtime/NEON/functions/NETableLookup.h
new file mode 100644
index 0000000000..b59ffb877c
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NETableLookup.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NETABLELOOKUP_H__
+#define __ARM_COMPUTE_NETABLELOOKUP_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+class ILut;
+
+/** Basic function to run @ref NETableLookupKernel */
+class NETableLookup : public INESimpleFunction
+{
+public:
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]  input  First tensor input. Data types supported: U8/S16
+     * @param[in]  lut    Input lookup table.
+     * @param[out] output Output tensor. Data types supported: same as @p input
+     */
+    void configure(const ITensor *input, const ILut *lut, ITensor *output);
+};
+}
+#endif /*__ARM_COMPUTE_NETABLELOOKUP_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEThreshold.h b/arm_compute/runtime/NEON/functions/NEThreshold.h
new file mode 100644
index 0000000000..d407ee5b15
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEThreshold.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NETHRESHOLD_H__
+#define __ARM_COMPUTE_NETHRESHOLD_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEThresholdKernel */
+class NEThreshold : public INESimpleFunction
+{
+public:
+    /** Initialise the function's source, destination, thresholds and threshold type
+     *
+     * @param[in]  input       First tensor input. Data type supported: U8.
+     * @param[out] output      Output tensor. Data type supported: U8.
+     * @param[in]  threshold   Threshold. If upper threshold is specified, this will be used as the lower threshold.
+     * @param[in]  false_value Value to assign when the condition is false.
+     * @param[in]  true_value  Value to assign when the condition is true.
+     * @param[in]  type        Thresholding type. Can either be BINARY or RANGE.
+     * @param[in]  upper       Upper threshold. Only used with RANGE thresholding.
+     */
+    void configure(const ITensor *input, ITensor *output, uint8_t threshold, uint8_t false_value = 0, uint8_t true_value = 0,
+                   ThresholdType type = ThresholdType::BINARY, uint8_t upper = 0);
+};
+}
+#endif /*__ARM_COMPUTE_NETHRESHOLD_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NETranspose.h b/arm_compute/runtime/NEON/functions/NETranspose.h
new file mode 100644
index 0000000000..4b606e7282
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NETranspose.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NETRANSPOSE_H__
+#define __ARM_COMPUTE_NETRANSPOSE_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to transpose a matrix on NEON. This function calls the following NEON kernel:
+ *
+ * -# @ref NETransposeKernel
+ *
+ */
+class NETranspose : public INESimpleFunction
+{
+public:
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]  input  Input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
+     * @param[out] output Output tensor. Data type supported: Same as @p input
+     */
+    void configure(const ITensor *input, ITensor *output);
+};
+}
+
+#endif /* __ARM_COMPUTE_NETRANSPOSE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEWarpAffine.h b/arm_compute/runtime/NEON/functions/NEWarpAffine.h
new file mode 100644
index 0000000000..f8eebe8d2a
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEWarpAffine.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEWARPAFFINE_H__
+#define __ARM_COMPUTE_NEWARPAFFINE_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEWarpAffineKernel */
+class NEWarpAffine : public INESimpleFunction
+{
+public:
+    /** Initialize the function's source, destination, interpolation policy and border mode.
+     *
+     * @param[in, out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     output                Destination tensor. Data type supported: U8.
+     * @param[in]      matrix                The affine matrix. Must be 2x3 of type float.
+     * @param[in]      policy                The interpolation type.
+     * @param[in]      border_mode           Strategy to use for borders.
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ITensor *input, ITensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_NEWARPAFFINE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEWarpPerspective.h b/arm_compute/runtime/NEON/functions/NEWarpPerspective.h
new file mode 100644
index 0000000000..d0699291b1
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEWarpPerspective.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEWARPPERSPECTIVE_H__
+#define __ARM_COMPUTE_NEWARPPERSPECTIVE_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEWarpPerspectiveKernel */
+class NEWarpPerspective : public INESimpleFunction
+{
+public:
+    /** Initialize the function's source, destination, interpolation policy and border mode.
+     *
+     * @param[in, out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     output                Destination tensor. Data type supported: U8.
+     * @param[in]      matrix                The perspective matrix. Must be 3x3 of type float.
+     * @param[in]      policy                The interpolation type.
+     * @param[in]      border_mode           Strategy to use for borders.
+     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(ITensor *input, ITensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_NEWARPPERSPECTIVE_H__ */
diff --git a/arm_compute/runtime/OMP/OMPScheduler.h b/arm_compute/runtime/OMP/OMPScheduler.h
new file mode 100644
index 0000000000..21df6a699d
--- /dev/null
+++ b/arm_compute/runtime/OMP/OMPScheduler.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_OMPSCHEDULER_H__ +#define __ARM_COMPUTE_OMPSCHEDULER_H__ + +#include "arm_compute/runtime/IScheduler.h" + +namespace arm_compute +{ +/** Pool of threads to automatically split a kernel's execution among several threads. */ +class OMPScheduler : public IScheduler +{ +public: + /** Sets the number of threads the scheduler will use to run the kernels. + * + * @param[in] num_threads If set to 0, then the number returned by omp_get_max_threads() will be used, otherwise the number of threads specified. + */ + void set_num_threads(unsigned int num_threads) override; + /** Returns the number of threads that the OMPScheduler has in its pool. + * + * @return Number of threads available in OMPScheduler. + */ + unsigned int num_threads() const override; + /** Access the scheduler singleton + * + * @return The scheduler + */ + static OMPScheduler &get(); + /** Multithread the execution of the passed kernel if possible. + * + * The kernel will run on a single thread if any of these conditions is true: + * - ICPPKernel::is_parallelisable() returns false + * - The scheduler has been initialized with only one thread. + * + * @param[in] kernel Kernel to execute. + * @param[in] split_dimension Dimension along which to split the kernel's execution window. + */ + void schedule(ICPPKernel *kernel, unsigned int split_dimension) override; + +private: + /** Constructor. */ + OMPScheduler(); + + unsigned int _num_threads; +}; +} +#endif /* __ARM_COMPUTE_OMPSCHEDULER_H__ */ diff --git a/arm_compute/runtime/Pyramid.h b/arm_compute/runtime/Pyramid.h new file mode 100644 index 0000000000..2e7613759f --- /dev/null +++ b/arm_compute/runtime/Pyramid.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
diff --git a/arm_compute/runtime/Pyramid.h b/arm_compute/runtime/Pyramid.h
new file mode 100644
index 0000000000..2e7613759f
--- /dev/null
+++ b/arm_compute/runtime/Pyramid.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_PYRAMID_H__
+#define __ARM_COMPUTE_PYRAMID_H__
+
+#include "arm_compute/core/IPyramid.h"
+#include "arm_compute/core/PyramidInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstddef>
+#include <memory>
+
+namespace arm_compute
+{
+class Tensor;
+
+/** Basic implementation of the pyramid interface */
+class Pyramid : public IPyramid
+{
+public:
+    /** Initialize pyramid data-object using the given Pyramid's metadata
+     *
+     * @param[in] info Pyramid's metadata
+     */
+    void init(const PyramidInfo &info);
+
+    /** Initialize pyramid data-object using the given Pyramid's metadata
+     *
+     * @note Uses conservative padding strategy which fits all kernels.
+     *
+     * @param[in] info Pyramid's metadata
+     */
+    void init_auto_padding(const PyramidInfo &info);
+
+    /** Allocate the planes in the pyramid */
+    void allocate();
+
+    // Inherited methods overridden:
+    const PyramidInfo *info() const override;
+    Tensor *get_pyramid_level(size_t index) const override;
+
+private:
+    /** Initialize pyramid data-object using the given Pyramid's metadata
+     *
+     * @param[in] info         Pyramid's metadata
+     * @param[in] auto_padding Specifies whether the images in the pyramid use auto padding
+     */
+    void internal_init(const PyramidInfo &info, bool auto_padding);
+
+    PyramidInfo               _info{};
+    std::unique_ptr<Tensor[]> _pyramid{ nullptr };
+};
+}
+#endif /*__ARM_COMPUTE_PYRAMID_H__ */
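[Editor's note: a hedged sketch of the init/allocate flow above, not part of the patch. It assumes PyramidInfo exposes a (num_levels, scale, shape, format) constructor and that SCALE_PYRAMID_HALF is available from core/Types.h; the 640x480 base size is illustrative.]

    #include "arm_compute/core/PyramidInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/Pyramid.h"

    using namespace arm_compute;

    void build_pyramid()
    {
        // Four half-scale levels on top of a 640x480 U8 base image.
        PyramidInfo info(4, SCALE_PYRAMID_HALF, TensorShape(640U, 480U), Format::U8);

        Pyramid pyramid;
        pyramid.init(info); // init_auto_padding(info) would instead pad each level to fit any kernel
        pyramid.allocate(); // back every level with CPU memory

        Tensor *base = pyramid.get_pyramid_level(0);
        (void)base;
    }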
diff --git a/arm_compute/runtime/Scheduler.h b/arm_compute/runtime/Scheduler.h
new file mode 100644
index 0000000000..21f944b75f
--- /dev/null
+++ b/arm_compute/runtime/Scheduler.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_SCHEDULER_H__
+#define __ARM_COMPUTE_SCHEDULER_H__
+
+#include "arm_compute/runtime/IScheduler.h"
+#include <memory>
+
+namespace arm_compute
+{
+/** Configurable scheduler which supports multiple multithreading APIs and allows choosing between different schedulers at runtime. */
+class Scheduler
+{
+public:
+    enum class Type
+    {
+        ST,    // Single thread.
+        CPP,   // C++11 threads.
+        OMP,   // OpenMP.
+        CUSTOM // Provided by the user.
+    };
+    /** Sets the user defined scheduler and makes it the active scheduler.
+     *
+     * @param[in] scheduler A shared pointer to a custom scheduler implemented by the user.
+     */
+    static void set(std::shared_ptr<IScheduler> &scheduler);
+    /** Access the scheduler singleton.
+     *
+     * @return A reference to the scheduler object.
+     */
+    static IScheduler &get();
+    /** Set the active scheduler.
+     *
+     * Only one scheduler can be enabled at any time.
+     *
+     * @param[in] t The type of the scheduler to be enabled.
+     */
+    static void set(Type t);
+    /** Returns the type of the active scheduler.
+     *
+     * @return The current scheduler's type.
+     */
+    static Type get_type();
+    /** Returns whether the given scheduler type is supported.
+     *
+     * @param[in] t Type of the scheduler to check.
+     *
+     * @return True if the given scheduler type is supported, false otherwise.
+     */
+    static bool is_available(Type t);
+
+private:
+    static Type _scheduler_type;
+    static std::shared_ptr<IScheduler> _custom_scheduler;
+    Scheduler();
+};
+}
+#endif /* __ARM_COMPUTE_SCHEDULER_H__ */
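[Editor's note: a minimal sketch of runtime scheduler selection using only the API declared above; editorial, not part of the patch. The thread count of 2 is arbitrary.]

    #include "arm_compute/runtime/Scheduler.h"

    using namespace arm_compute;

    void pick_scheduler()
    {
        // Prefer OpenMP when this build supports it, otherwise fall back
        // to the C++11 thread implementation.
        if(Scheduler::is_available(Scheduler::Type::OMP))
        {
            Scheduler::set(Scheduler::Type::OMP);
        }
        else
        {
            Scheduler::set(Scheduler::Type::CPP);
        }

        // Functions dispatch their kernels through this singleton.
        IScheduler &scheduler = Scheduler::get();
        scheduler.set_num_threads(2);
    }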
diff --git a/arm_compute/runtime/SingleThreadScheduler.h b/arm_compute/runtime/SingleThreadScheduler.h
new file mode 100644
index 0000000000..a6e1defe7c
--- /dev/null
+++ b/arm_compute/runtime/SingleThreadScheduler.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_SINGLETHREADSCHEDULER_H__
+#define __ARM_COMPUTE_SINGLETHREADSCHEDULER_H__
+
+#include "arm_compute/runtime/IScheduler.h"
+
+namespace arm_compute
+{
+/** Scheduler that runs all kernels sequentially on the caller's thread. */
+class SingleThreadScheduler : public IScheduler
+{
+public:
+    /** Sets the number of threads the scheduler will use to run the kernels.
+     *
+     * @param[in] num_threads This is ignored for this scheduler as the number of threads is always one.
+     */
+    void set_num_threads(unsigned int num_threads) override;
+    /** Returns the number of threads that the SingleThreadScheduler has, which is always 1.
+     *
+     * @return Number of threads available in SingleThreadScheduler.
+     */
+    unsigned int num_threads() const override;
+    /** Access the scheduler singleton
+     *
+     * @return The scheduler
+     */
+    static SingleThreadScheduler &get();
+    /** Runs the kernel in the same thread as the caller synchronously.
+     *
+     * @param[in] kernel          Kernel to execute.
+     * @param[in] split_dimension Dimension along which to split the kernel's execution window.
+     */
+    void schedule(ICPPKernel *kernel, unsigned int split_dimension) override;
+
+private:
+    /** Constructor. */
+    SingleThreadScheduler() = default;
+};
+}
+#endif /* __ARM_COMPUTE_SINGLETHREADSCHEDULER_H__ */
diff --git a/arm_compute/runtime/SubTensor.h b/arm_compute/runtime/SubTensor.h
new file mode 100644
index 0000000000..bdb229de49
--- /dev/null
+++ b/arm_compute/runtime/SubTensor.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_SUBTENSOR_H__
+#define __ARM_COMPUTE_SUBTENSOR_H__
+
+#include "arm_compute/core/SubTensorInfo.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensorInfo;
+
+/** Basic implementation of the sub-tensor interface */
+class SubTensor : public ITensor
+{
+public:
+    /** Constructor
+     *
+     * @param[in] parent       Parent tensor
+     * @param[in] tensor_shape Shape of the subtensor
+     * @param[in] coords       Coordinates of the first subtensor element inside the parent tensor.
+     */
+    SubTensor(ITensor *parent, const TensorShape &tensor_shape, const Coordinates &coords);
+    /** Destructor */
+    ~SubTensor() = default;
+    /** Prevent instances of this class from being copy constructed */
+    SubTensor(const SubTensor &) = delete;
+    /** Prevent instances of this class from being copied */
+    SubTensor &operator=(const SubTensor &) = delete;
+    /** Allow instances of this class to be move constructed */
+    SubTensor(SubTensor &&) = default;
+    /** Allow instances of this class to be moved */
+    SubTensor &operator=(SubTensor &&) = default;
+    /** Return the parent tensor of the subtensor
+     *
+     * @return Parent tensor
+     */
+    ITensor *parent();
+
+    // Inherited methods overridden:
+    ITensorInfo *info() const override;
+    ITensorInfo *info() override;
+    uint8_t *buffer() const override;
+
+private:
+    ITensor              *_parent;
+    mutable SubTensorInfo _info;
+};
+}
+#endif /*__ARM_COMPUTE_SUBTENSOR_H__ */
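[Editor's note: a hedged sketch of creating a sub-tensor view with the constructor above, not part of the patch. It assumes `parent` is an already-initialised and allocated 2D tensor; the window size and offset are illustrative.]

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/SubTensor.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void make_view(Tensor &parent)
    {
        // 32x32 window whose first element sits at (16, 16) in the parent;
        // the view aliases the parent's buffer instead of owning memory.
        SubTensor view(&parent, TensorShape(32U, 32U), Coordinates(16, 16));

        ITensorInfo *window_info = view.info(); // Describes the window, not the parent.
        (void)window_info;
    }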
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TENSOR_H__
+#define __ARM_COMPUTE_TENSOR_H__
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensorInfo;
+
+/** Basic implementation of the tensor interface */
+class Tensor : public ITensor
+{
+public:
+    /** Constructor */
+    Tensor();
+    /** Destructor: free the tensor's memory */
+    ~Tensor() = default;
+    /** Allow instances of this class to be move constructed */
+    Tensor(Tensor &&) = default;
+    /** Allow instances of this class to be moved */
+    Tensor &operator=(Tensor &&) = default;
+    /** Return a pointer to the tensor's allocator
+     *
+     * @return A pointer to the tensor's allocator
+     */
+    TensorAllocator *allocator();
+
+    // Inherited methods overridden:
+    ITensorInfo *info() const override;
+    ITensorInfo *info() override;
+    uint8_t *buffer() const override;
+
+private:
+    mutable TensorAllocator _allocator; /**< Instance of the basic CPU allocator.*/
+};
+
+using Image = Tensor;
+}
+#endif /*__ARM_COMPUTE_TENSOR_H__ */
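[Editor's note: a minimal sketch of the usual init/allocate lifecycle for the Tensor class above, not part of the patch. It assumes TensorInfo exposes a (width, height, format) constructor; the 640x480 size is illustrative.]

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void create_image()
    {
        Tensor image;

        // Describe a 640x480 single-plane U8 image, then reserve its backing
        // CPU buffer through the tensor's allocator.
        image.allocator()->init(TensorInfo(640U, 480U, Format::U8));
        image.allocator()->allocate();

        uint8_t *data = image.buffer();
        (void)data;
    }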
diff --git a/arm_compute/runtime/TensorAllocator.h b/arm_compute/runtime/TensorAllocator.h
new file mode 100644
index 0000000000..450323b3ab
--- /dev/null
+++ b/arm_compute/runtime/TensorAllocator.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TENSORALLOCATOR_H__
+#define __ARM_COMPUTE_TENSORALLOCATOR_H__
+
+#include "arm_compute/runtime/ITensorAllocator.h"
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+namespace arm_compute
+{
+class Coordinates;
+class TensorInfo;
+
+/** Basic implementation of a CPU memory tensor allocator. */
+class TensorAllocator : public ITensorAllocator
+{
+public:
+    /** Default constructor. */
+    TensorAllocator();
+
+    /** Make ITensorAllocator's init methods available */
+    using ITensorAllocator::init;
+
+    /** Shares the same backing memory with another tensor allocator, while the tensor info might be different.
+     * In other words this can be used to create a sub-tensor from another tensor while sharing the same memory.
+     *
+     * @note The TensorAllocators have to be of the same specialized type.
+     *
+     * @param[in] allocator The allocator that owns the backing memory to be shared. Ownership becomes shared afterwards.
+     * @param[in] coords    The starting coordinates of the new tensor inside the parent tensor.
+     * @param[in] sub_info  The new tensor information (e.g. shape, etc.)
+     */
+    void init(const TensorAllocator &allocator, const Coordinates &coords, TensorInfo sub_info);
+
+    /** Returns the pointer to the allocated data. */
+    uint8_t *data() const;
+
+    /** Allocate CPU memory of the size specified by the TensorInfo.
+     *
+     * @note The tensor must not already be allocated when calling this function.
+     */
+    void allocate() override;
+
+    /** Free allocated CPU memory.
+     *
+     * @note The tensor must have been allocated when calling this function.
+     */
+    void free() override;
+
+protected:
+    /** No-op for CPU memory
+     *
+     * @return A pointer to the beginning of the tensor's allocation.
+     */
+    uint8_t *lock() override;
+
+    /** No-op for CPU memory. */
+    void unlock() override;
+
+private:
+    std::shared_ptr<std::vector<uint8_t>> _buffer; /**< CPU memory allocation. */
+};
+}
+#endif /* __ARM_COMPUTE_TENSORALLOCATOR_H__ */
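[Editor's note: a hedged sketch of the memory-sharing init overload above, not part of the patch. It assumes TensorInfo exposes a (shape, format) constructor and that the parent has been allocated first; sizes and coordinates are illustrative.]

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void share_backing_memory()
    {
        // The parent owns the actual buffer...
        Tensor parent;
        parent.allocator()->init(TensorInfo(TensorShape(64U, 64U), Format::U8));
        parent.allocator()->allocate();

        // ...while the child aliases a 32x32 region starting at (0, 16),
        // so no additional CPU memory is allocated for it.
        Tensor child;
        child.allocator()->init(*parent.allocator(), Coordinates(0, 16), TensorInfo(TensorShape(32U, 32U), Format::U8));
    }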
diff --git a/arm_compute/runtime/Utils.h b/arm_compute/runtime/Utils.h
new file mode 100644
index 0000000000..2f037a0621
--- /dev/null
+++ b/arm_compute/runtime/Utils.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_RUNTIME_UTILS_H__
+#define __ARM_COMPUTE_RUNTIME_UTILS_H__
+
+#include "arm_compute/runtime/Scheduler.h"
+
+#include <string>
+
+namespace arm_compute
+{
+/** Convert a Scheduler::Type into a string.
+ *
+ * @param[in] t @ref Scheduler::Type to be translated to string.
+ *
+ * @return The string describing the scheduler type.
+ */
+const std::string &string_from_scheduler_type(Scheduler::Type t);
+}
+#endif /* __ARM_COMPUTE_RUNTIME_UTILS_H__ */
--
cgit v1.2.1