From 6ff3b19ee6120edf015fad8caab2991faa3070af Mon Sep 17 00:00:00 2001 From: Anthony Barbier Date: Mon, 4 Sep 2017 18:44:23 +0100 Subject: COMPMID-344 Updated doxygen Change-Id: I32f7b84daa560e460b77216add529c8fa8b327ae --- src/runtime/CL/CLDistribution1D.cpp | 61 ++++ src/runtime/CL/CLHOG.cpp | 84 +++++ src/runtime/CL/CLLut.cpp | 99 ++++++ src/runtime/CL/CLLutAllocator.cpp | 77 +++++ src/runtime/CL/CLMultiHOG.cpp | 52 ++++ src/runtime/CL/CLMultiImage.cpp | 168 ++++++++++ src/runtime/CL/CLPyramid.cpp | 130 ++++++++ src/runtime/CL/CLScheduler.cpp | 49 +++ src/runtime/CL/CLSubTensor.cpp | 81 +++++ src/runtime/CL/CLTensor.cpp | 73 +++++ src/runtime/CL/CLTensorAllocator.cpp | 87 ++++++ src/runtime/CL/ICLSimpleFunction.cpp | 42 +++ src/runtime/CL/functions/CLAbsoluteDifference.cpp | 38 +++ src/runtime/CL/functions/CLAccumulate.cpp | 52 ++++ src/runtime/CL/functions/CLActivationLayer.cpp | 36 +++ src/runtime/CL/functions/CLArithmeticAddition.cpp | 38 +++ .../CL/functions/CLArithmeticSubtraction.cpp | 38 +++ .../CL/functions/CLBatchNormalizationLayer.cpp | 48 +++ src/runtime/CL/functions/CLBitwiseAnd.cpp | 38 +++ src/runtime/CL/functions/CLBitwiseNot.cpp | 38 +++ src/runtime/CL/functions/CLBitwiseOr.cpp | 38 +++ src/runtime/CL/functions/CLBitwiseXor.cpp | 38 +++ src/runtime/CL/functions/CLBox3x3.cpp | 40 +++ src/runtime/CL/functions/CLCannyEdge.cpp | 155 ++++++++++ src/runtime/CL/functions/CLChannelCombine.cpp | 45 +++ src/runtime/CL/functions/CLChannelExtract.cpp | 45 +++ src/runtime/CL/functions/CLColorConvert.cpp | 59 ++++ src/runtime/CL/functions/CLConvolution.cpp | 114 +++++++ src/runtime/CL/functions/CLConvolutionLayer.cpp | 247 +++++++++++++++ src/runtime/CL/functions/CLDepthConcatenate.cpp | 71 +++++ src/runtime/CL/functions/CLDepthConvert.cpp | 38 +++ src/runtime/CL/functions/CLDerivative.cpp | 40 +++ src/runtime/CL/functions/CLDilate.cpp | 40 +++ src/runtime/CL/functions/CLEqualizeHistogram.cpp | 110 +++++++ src/runtime/CL/functions/CLErode.cpp | 40 +++ src/runtime/CL/functions/CLFastCorners.cpp | 127 ++++++++ src/runtime/CL/functions/CLFillBorder.cpp | 38 +++ src/runtime/CL/functions/CLFullyConnectedLayer.cpp | 343 ++++++++++++++++++++ src/runtime/CL/functions/CLGEMM.cpp | 145 +++++++++ src/runtime/CL/functions/CLGEMMInterleave4x4.cpp | 36 +++ src/runtime/CL/functions/CLGEMMLowp.cpp | 85 +++++ src/runtime/CL/functions/CLGaussian3x3.cpp | 40 +++ src/runtime/CL/functions/CLGaussian5x5.cpp | 62 ++++ src/runtime/CL/functions/CLGaussianPyramid.cpp | 183 +++++++++++ src/runtime/CL/functions/CLHOGDescriptor.cpp | 99 ++++++ src/runtime/CL/functions/CLHOGDetector.cpp | 69 +++++ src/runtime/CL/functions/CLHOGGradient.cpp | 75 +++++ src/runtime/CL/functions/CLHOGMultiDetection.cpp | 240 ++++++++++++++ src/runtime/CL/functions/CLHarrisCorners.cpp | 157 ++++++++++ src/runtime/CL/functions/CLHistogram.cpp | 45 +++ src/runtime/CL/functions/CLIntegralImage.cpp | 46 +++ src/runtime/CL/functions/CLLaplacianPyramid.cpp | 99 ++++++ .../CL/functions/CLLaplacianReconstruct.cpp | 99 ++++++ .../CL/functions/CLLocallyConnectedLayer.cpp | 131 ++++++++ src/runtime/CL/functions/CLMagnitude.cpp | 38 +++ src/runtime/CL/functions/CLMeanStdDev.cpp | 53 ++++ src/runtime/CL/functions/CLMedian3x3.cpp | 40 +++ src/runtime/CL/functions/CLMinMaxLocation.cpp | 98 ++++++ src/runtime/CL/functions/CLNonLinearFilter.cpp | 40 +++ .../CL/functions/CLNonMaximaSuppression3x3.cpp | 47 +++ src/runtime/CL/functions/CLNormalizationLayer.cpp | 60 ++++ src/runtime/CL/functions/CLOpticalFlow.cpp | 150 +++++++++ 
src/runtime/CL/functions/CLPhase.cpp | 38 +++ .../CL/functions/CLPixelWiseMultiplication.cpp | 39 +++ src/runtime/CL/functions/CLPoolingLayer.cpp | 41 +++ src/runtime/CL/functions/CLRemap.cpp | 50 +++ src/runtime/CL/functions/CLScale.cpp | 45 +++ src/runtime/CL/functions/CLScharr3x3.cpp | 40 +++ src/runtime/CL/functions/CLSobel3x3.cpp | 40 +++ src/runtime/CL/functions/CLSobel5x5.cpp | 81 +++++ src/runtime/CL/functions/CLSobel7x7.cpp | 81 +++++ src/runtime/CL/functions/CLSoftmaxLayer.cpp | 67 ++++ src/runtime/CL/functions/CLTableLookup.cpp | 38 +++ src/runtime/CL/functions/CLThreshold.cpp | 38 +++ src/runtime/CL/functions/CLTranspose.cpp | 38 +++ src/runtime/CL/functions/CLWarpAffine.cpp | 40 +++ src/runtime/CL/functions/CLWarpPerspective.cpp | 40 +++ src/runtime/CPP/CPPScheduler.cpp | 225 ++++++++++++++ src/runtime/CPP/SingleThreadScheduler.cpp | 52 ++++ src/runtime/Distribution1D.cpp | 42 +++ src/runtime/HOG.cpp | 51 +++ src/runtime/ILutAllocator.cpp | 58 ++++ src/runtime/ITensorAllocator.cpp | 51 +++ src/runtime/Lut.cpp | 75 +++++ src/runtime/LutAllocator.cpp | 52 ++++ src/runtime/MultiHOG.cpp | 52 ++++ src/runtime/MultiImage.cpp | 220 +++++++++++++ src/runtime/NEON/INESimpleFunction.cpp | 39 +++ .../NEON/functions/NEAbsoluteDifference.cpp | 38 +++ src/runtime/NEON/functions/NEAccumulate.cpp | 61 ++++ src/runtime/NEON/functions/NEActivationLayer.cpp | 36 +++ .../NEON/functions/NEArithmeticAddition.cpp | 38 +++ .../NEON/functions/NEArithmeticSubtraction.cpp | 38 +++ .../NEON/functions/NEBatchNormalizationLayer.cpp | 49 +++ src/runtime/NEON/functions/NEBitwiseAnd.cpp | 38 +++ src/runtime/NEON/functions/NEBitwiseNot.cpp | 38 +++ src/runtime/NEON/functions/NEBitwiseOr.cpp | 38 +++ src/runtime/NEON/functions/NEBitwiseXor.cpp | 38 +++ src/runtime/NEON/functions/NEBox3x3.cpp | 49 +++ src/runtime/NEON/functions/NECannyEdge.cpp | 169 ++++++++++ src/runtime/NEON/functions/NEChannelCombine.cpp | 45 +++ src/runtime/NEON/functions/NEChannelExtract.cpp | 45 +++ src/runtime/NEON/functions/NEColorConvert.cpp | 59 ++++ src/runtime/NEON/functions/NEConvolution.cpp | 120 +++++++ src/runtime/NEON/functions/NEConvolutionLayer.cpp | 246 +++++++++++++++ src/runtime/NEON/functions/NEDepthConcatenate.cpp | 67 ++++ src/runtime/NEON/functions/NEDepthConvert.cpp | 44 +++ src/runtime/NEON/functions/NEDerivative.cpp | 52 ++++ src/runtime/NEON/functions/NEDilate.cpp | 40 +++ .../NEON/functions/NEDirectConvolutionLayer.cpp | 75 +++++ src/runtime/NEON/functions/NEEqualizeHistogram.cpp | 62 ++++ src/runtime/NEON/functions/NEErode.cpp | 40 +++ src/runtime/NEON/functions/NEFastCorners.cpp | 101 ++++++ src/runtime/NEON/functions/NEFillBorder.cpp | 39 +++ .../NEON/functions/NEFullyConnectedLayer.cpp | 344 +++++++++++++++++++++ src/runtime/NEON/functions/NEGEMM.cpp | 156 ++++++++++ src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp | 36 +++ src/runtime/NEON/functions/NEGEMMLowp.cpp | 84 +++++ src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp | 40 +++ src/runtime/NEON/functions/NEGaussian3x3.cpp | 40 +++ src/runtime/NEON/functions/NEGaussian5x5.cpp | 60 ++++ src/runtime/NEON/functions/NEGaussianPyramid.cpp | 183 +++++++++++ src/runtime/NEON/functions/NEHOGDescriptor.cpp | 99 ++++++ src/runtime/NEON/functions/NEHOGDetector.cpp | 36 +++ src/runtime/NEON/functions/NEHOGGradient.cpp | 80 +++++ src/runtime/NEON/functions/NEHOGMultiDetection.cpp | 231 ++++++++++++++ src/runtime/NEON/functions/NEHarrisCorners.cpp | 212 +++++++++++++ src/runtime/NEON/functions/NEHistogram.cpp | 58 ++++ src/runtime/NEON/functions/NEIntegralImage.cpp | 
40 +++ src/runtime/NEON/functions/NELaplacianPyramid.cpp | 102 ++++++ .../NEON/functions/NELaplacianReconstruct.cpp | 100 ++++++ .../NEON/functions/NELocallyConnectedLayer.cpp | 131 ++++++++ src/runtime/NEON/functions/NEMagnitude.cpp | 48 +++ src/runtime/NEON/functions/NEMeanStdDev.cpp | 47 +++ src/runtime/NEON/functions/NEMedian3x3.cpp | 40 +++ src/runtime/NEON/functions/NEMinMaxLocation.cpp | 50 +++ src/runtime/NEON/functions/NENonLinearFilter.cpp | 42 +++ .../NEON/functions/NENonMaximaSuppression3x3.cpp | 47 +++ .../NEON/functions/NENormalizationLayer.cpp | 61 ++++ src/runtime/NEON/functions/NEOpticalFlow.cpp | 119 +++++++ src/runtime/NEON/functions/NEPhase.cpp | 38 +++ .../NEON/functions/NEPixelWiseMultiplication.cpp | 38 +++ src/runtime/NEON/functions/NEPoolingLayer.cpp | 41 +++ src/runtime/NEON/functions/NERemap.cpp | 53 ++++ src/runtime/NEON/functions/NEScale.cpp | 171 ++++++++++ src/runtime/NEON/functions/NEScharr3x3.cpp | 40 +++ src/runtime/NEON/functions/NESobel3x3.cpp | 40 +++ src/runtime/NEON/functions/NESobel5x5.cpp | 81 +++++ src/runtime/NEON/functions/NESobel7x7.cpp | 81 +++++ src/runtime/NEON/functions/NESoftmaxLayer.cpp | 72 +++++ src/runtime/NEON/functions/NETableLookup.cpp | 38 +++ src/runtime/NEON/functions/NEThreshold.cpp | 38 +++ src/runtime/NEON/functions/NETranspose.cpp | 38 +++ src/runtime/NEON/functions/NEWarpAffine.cpp | 62 ++++ src/runtime/NEON/functions/NEWarpPerspective.cpp | 62 ++++ src/runtime/OMP/OMPScheduler.cpp | 83 +++++ src/runtime/Pyramid.cpp | 120 +++++++ src/runtime/Scheduler.cpp | 149 +++++++++ src/runtime/SubTensor.cpp | 57 ++++ src/runtime/Tensor.cpp | 51 +++ src/runtime/TensorAllocator.cpp | 119 +++++++ src/runtime/Utils.cpp | 42 +++ 162 files changed, 12392 insertions(+) create mode 100644 src/runtime/CL/CLDistribution1D.cpp create mode 100644 src/runtime/CL/CLHOG.cpp create mode 100644 src/runtime/CL/CLLut.cpp create mode 100644 src/runtime/CL/CLLutAllocator.cpp create mode 100644 src/runtime/CL/CLMultiHOG.cpp create mode 100644 src/runtime/CL/CLMultiImage.cpp create mode 100644 src/runtime/CL/CLPyramid.cpp create mode 100644 src/runtime/CL/CLScheduler.cpp create mode 100644 src/runtime/CL/CLSubTensor.cpp create mode 100644 src/runtime/CL/CLTensor.cpp create mode 100644 src/runtime/CL/CLTensorAllocator.cpp create mode 100644 src/runtime/CL/ICLSimpleFunction.cpp create mode 100644 src/runtime/CL/functions/CLAbsoluteDifference.cpp create mode 100644 src/runtime/CL/functions/CLAccumulate.cpp create mode 100644 src/runtime/CL/functions/CLActivationLayer.cpp create mode 100644 src/runtime/CL/functions/CLArithmeticAddition.cpp create mode 100644 src/runtime/CL/functions/CLArithmeticSubtraction.cpp create mode 100644 src/runtime/CL/functions/CLBatchNormalizationLayer.cpp create mode 100644 src/runtime/CL/functions/CLBitwiseAnd.cpp create mode 100644 src/runtime/CL/functions/CLBitwiseNot.cpp create mode 100644 src/runtime/CL/functions/CLBitwiseOr.cpp create mode 100644 src/runtime/CL/functions/CLBitwiseXor.cpp create mode 100644 src/runtime/CL/functions/CLBox3x3.cpp create mode 100644 src/runtime/CL/functions/CLCannyEdge.cpp create mode 100644 src/runtime/CL/functions/CLChannelCombine.cpp create mode 100644 src/runtime/CL/functions/CLChannelExtract.cpp create mode 100644 src/runtime/CL/functions/CLColorConvert.cpp create mode 100644 src/runtime/CL/functions/CLConvolution.cpp create mode 100644 src/runtime/CL/functions/CLConvolutionLayer.cpp create mode 100644 src/runtime/CL/functions/CLDepthConcatenate.cpp create mode 100644 
src/runtime/CL/functions/CLDepthConvert.cpp create mode 100644 src/runtime/CL/functions/CLDerivative.cpp create mode 100644 src/runtime/CL/functions/CLDilate.cpp create mode 100644 src/runtime/CL/functions/CLEqualizeHistogram.cpp create mode 100644 src/runtime/CL/functions/CLErode.cpp create mode 100644 src/runtime/CL/functions/CLFastCorners.cpp create mode 100644 src/runtime/CL/functions/CLFillBorder.cpp create mode 100644 src/runtime/CL/functions/CLFullyConnectedLayer.cpp create mode 100644 src/runtime/CL/functions/CLGEMM.cpp create mode 100644 src/runtime/CL/functions/CLGEMMInterleave4x4.cpp create mode 100644 src/runtime/CL/functions/CLGEMMLowp.cpp create mode 100644 src/runtime/CL/functions/CLGaussian3x3.cpp create mode 100644 src/runtime/CL/functions/CLGaussian5x5.cpp create mode 100644 src/runtime/CL/functions/CLGaussianPyramid.cpp create mode 100644 src/runtime/CL/functions/CLHOGDescriptor.cpp create mode 100644 src/runtime/CL/functions/CLHOGDetector.cpp create mode 100644 src/runtime/CL/functions/CLHOGGradient.cpp create mode 100644 src/runtime/CL/functions/CLHOGMultiDetection.cpp create mode 100644 src/runtime/CL/functions/CLHarrisCorners.cpp create mode 100644 src/runtime/CL/functions/CLHistogram.cpp create mode 100644 src/runtime/CL/functions/CLIntegralImage.cpp create mode 100644 src/runtime/CL/functions/CLLaplacianPyramid.cpp create mode 100644 src/runtime/CL/functions/CLLaplacianReconstruct.cpp create mode 100644 src/runtime/CL/functions/CLLocallyConnectedLayer.cpp create mode 100644 src/runtime/CL/functions/CLMagnitude.cpp create mode 100644 src/runtime/CL/functions/CLMeanStdDev.cpp create mode 100644 src/runtime/CL/functions/CLMedian3x3.cpp create mode 100644 src/runtime/CL/functions/CLMinMaxLocation.cpp create mode 100644 src/runtime/CL/functions/CLNonLinearFilter.cpp create mode 100644 src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp create mode 100644 src/runtime/CL/functions/CLNormalizationLayer.cpp create mode 100644 src/runtime/CL/functions/CLOpticalFlow.cpp create mode 100644 src/runtime/CL/functions/CLPhase.cpp create mode 100644 src/runtime/CL/functions/CLPixelWiseMultiplication.cpp create mode 100644 src/runtime/CL/functions/CLPoolingLayer.cpp create mode 100644 src/runtime/CL/functions/CLRemap.cpp create mode 100644 src/runtime/CL/functions/CLScale.cpp create mode 100644 src/runtime/CL/functions/CLScharr3x3.cpp create mode 100644 src/runtime/CL/functions/CLSobel3x3.cpp create mode 100644 src/runtime/CL/functions/CLSobel5x5.cpp create mode 100644 src/runtime/CL/functions/CLSobel7x7.cpp create mode 100644 src/runtime/CL/functions/CLSoftmaxLayer.cpp create mode 100644 src/runtime/CL/functions/CLTableLookup.cpp create mode 100644 src/runtime/CL/functions/CLThreshold.cpp create mode 100644 src/runtime/CL/functions/CLTranspose.cpp create mode 100644 src/runtime/CL/functions/CLWarpAffine.cpp create mode 100644 src/runtime/CL/functions/CLWarpPerspective.cpp create mode 100644 src/runtime/CPP/CPPScheduler.cpp create mode 100644 src/runtime/CPP/SingleThreadScheduler.cpp create mode 100644 src/runtime/Distribution1D.cpp create mode 100644 src/runtime/HOG.cpp create mode 100644 src/runtime/ILutAllocator.cpp create mode 100644 src/runtime/ITensorAllocator.cpp create mode 100644 src/runtime/Lut.cpp create mode 100644 src/runtime/LutAllocator.cpp create mode 100644 src/runtime/MultiHOG.cpp create mode 100644 src/runtime/MultiImage.cpp create mode 100644 src/runtime/NEON/INESimpleFunction.cpp create mode 100644 src/runtime/NEON/functions/NEAbsoluteDifference.cpp create 
mode 100644 src/runtime/NEON/functions/NEAccumulate.cpp create mode 100644 src/runtime/NEON/functions/NEActivationLayer.cpp create mode 100644 src/runtime/NEON/functions/NEArithmeticAddition.cpp create mode 100644 src/runtime/NEON/functions/NEArithmeticSubtraction.cpp create mode 100644 src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp create mode 100644 src/runtime/NEON/functions/NEBitwiseAnd.cpp create mode 100644 src/runtime/NEON/functions/NEBitwiseNot.cpp create mode 100644 src/runtime/NEON/functions/NEBitwiseOr.cpp create mode 100644 src/runtime/NEON/functions/NEBitwiseXor.cpp create mode 100644 src/runtime/NEON/functions/NEBox3x3.cpp create mode 100644 src/runtime/NEON/functions/NECannyEdge.cpp create mode 100644 src/runtime/NEON/functions/NEChannelCombine.cpp create mode 100644 src/runtime/NEON/functions/NEChannelExtract.cpp create mode 100644 src/runtime/NEON/functions/NEColorConvert.cpp create mode 100644 src/runtime/NEON/functions/NEConvolution.cpp create mode 100644 src/runtime/NEON/functions/NEConvolutionLayer.cpp create mode 100644 src/runtime/NEON/functions/NEDepthConcatenate.cpp create mode 100644 src/runtime/NEON/functions/NEDepthConvert.cpp create mode 100644 src/runtime/NEON/functions/NEDerivative.cpp create mode 100644 src/runtime/NEON/functions/NEDilate.cpp create mode 100644 src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp create mode 100644 src/runtime/NEON/functions/NEEqualizeHistogram.cpp create mode 100644 src/runtime/NEON/functions/NEErode.cpp create mode 100644 src/runtime/NEON/functions/NEFastCorners.cpp create mode 100644 src/runtime/NEON/functions/NEFillBorder.cpp create mode 100644 src/runtime/NEON/functions/NEFullyConnectedLayer.cpp create mode 100644 src/runtime/NEON/functions/NEGEMM.cpp create mode 100644 src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp create mode 100644 src/runtime/NEON/functions/NEGEMMLowp.cpp create mode 100644 src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp create mode 100644 src/runtime/NEON/functions/NEGaussian3x3.cpp create mode 100644 src/runtime/NEON/functions/NEGaussian5x5.cpp create mode 100644 src/runtime/NEON/functions/NEGaussianPyramid.cpp create mode 100644 src/runtime/NEON/functions/NEHOGDescriptor.cpp create mode 100644 src/runtime/NEON/functions/NEHOGDetector.cpp create mode 100644 src/runtime/NEON/functions/NEHOGGradient.cpp create mode 100644 src/runtime/NEON/functions/NEHOGMultiDetection.cpp create mode 100644 src/runtime/NEON/functions/NEHarrisCorners.cpp create mode 100644 src/runtime/NEON/functions/NEHistogram.cpp create mode 100644 src/runtime/NEON/functions/NEIntegralImage.cpp create mode 100644 src/runtime/NEON/functions/NELaplacianPyramid.cpp create mode 100644 src/runtime/NEON/functions/NELaplacianReconstruct.cpp create mode 100644 src/runtime/NEON/functions/NELocallyConnectedLayer.cpp create mode 100644 src/runtime/NEON/functions/NEMagnitude.cpp create mode 100644 src/runtime/NEON/functions/NEMeanStdDev.cpp create mode 100644 src/runtime/NEON/functions/NEMedian3x3.cpp create mode 100644 src/runtime/NEON/functions/NEMinMaxLocation.cpp create mode 100644 src/runtime/NEON/functions/NENonLinearFilter.cpp create mode 100644 src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp create mode 100644 src/runtime/NEON/functions/NENormalizationLayer.cpp create mode 100644 src/runtime/NEON/functions/NEOpticalFlow.cpp create mode 100644 src/runtime/NEON/functions/NEPhase.cpp create mode 100644 src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp create mode 100644 
src/runtime/NEON/functions/NEPoolingLayer.cpp
 create mode 100644 src/runtime/NEON/functions/NERemap.cpp
 create mode 100644 src/runtime/NEON/functions/NEScale.cpp
 create mode 100644 src/runtime/NEON/functions/NEScharr3x3.cpp
 create mode 100644 src/runtime/NEON/functions/NESobel3x3.cpp
 create mode 100644 src/runtime/NEON/functions/NESobel5x5.cpp
 create mode 100644 src/runtime/NEON/functions/NESobel7x7.cpp
 create mode 100644 src/runtime/NEON/functions/NESoftmaxLayer.cpp
 create mode 100644 src/runtime/NEON/functions/NETableLookup.cpp
 create mode 100644 src/runtime/NEON/functions/NEThreshold.cpp
 create mode 100644 src/runtime/NEON/functions/NETranspose.cpp
 create mode 100644 src/runtime/NEON/functions/NEWarpAffine.cpp
 create mode 100644 src/runtime/NEON/functions/NEWarpPerspective.cpp
 create mode 100644 src/runtime/OMP/OMPScheduler.cpp
 create mode 100644 src/runtime/Pyramid.cpp
 create mode 100644 src/runtime/Scheduler.cpp
 create mode 100644 src/runtime/SubTensor.cpp
 create mode 100644 src/runtime/Tensor.cpp
 create mode 100644 src/runtime/TensorAllocator.cpp
 create mode 100644 src/runtime/Utils.cpp
(limited to 'src/runtime')

diff --git a/src/runtime/CL/CLDistribution1D.cpp b/src/runtime/CL/CLDistribution1D.cpp
new file mode 100644
index 0000000000..f1dd95e77e
--- /dev/null
+++ b/src/runtime/CL/CLDistribution1D.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLDistribution1D.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLDistribution1D::CLDistribution1D(size_t num_bins, int32_t offset, uint32_t range)
+    : ICLDistribution1D(num_bins, offset, range), _mem(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, num_bins * sizeof(int32_t))
+{
+}
+
+void CLDistribution1D::map(bool blocking)
+{
+    ICLDistribution1D::map(CLScheduler::get().queue(), blocking);
+}
+
+void CLDistribution1D::unmap()
+{
+    ICLDistribution1D::unmap(CLScheduler::get().queue());
+}
+
+uint32_t *CLDistribution1D::do_map(cl::CommandQueue &q, bool blocking)
+{
+    ARM_COMPUTE_ERROR_ON(_mem.get() == nullptr);
+    return static_cast<uint32_t *>(q.enqueueMapBuffer(_mem, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, size()));
+}
+
+void CLDistribution1D::do_unmap(cl::CommandQueue &q)
+{
+    ARM_COMPUTE_ERROR_ON(_mem.get() == nullptr);
+    q.enqueueUnmapMemObject(_mem, _mapping);
+}
+
+cl::Buffer &CLDistribution1D::cl_buffer()
+{
+    return _mem;
+}
diff --git a/src/runtime/CL/CLHOG.cpp b/src/runtime/CL/CLHOG.cpp
new file mode 100644
index 0000000000..3f5266ce70
--- /dev/null
+++ b/src/runtime/CL/CLHOG.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/CLHOG.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLHOG::CLHOG()
+    : _info(), _buffer()
+{
+}
+
+void CLHOG::init(const HOGInfo &input)
+{
+    ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr);
+    _info   = input;
+    _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info()->descriptor_size() * sizeof(float));
+}
+
+void CLHOG::free()
+{
+    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+
+    _buffer = cl::Buffer();
+}
+
+const HOGInfo *CLHOG::info() const
+{
+    return &_info;
+}
+
+const cl::Buffer &CLHOG::cl_buffer() const
+{
+    return _buffer;
+}
+
+void CLHOG::map(bool blocking)
+{
+    ARM_COMPUTE_ERROR_ON(descriptor() != nullptr);
+    ICLHOG::map(CLScheduler::get().queue(), blocking);
+}
+
+void CLHOG::unmap()
+{
+    ARM_COMPUTE_ERROR_ON(descriptor() == nullptr);
+    ICLHOG::unmap(CLScheduler::get().queue());
+}
+
+uint8_t *CLHOG::do_map(cl::CommandQueue &q, bool blocking)
+{
+    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+    return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->descriptor_size()));
+}
+
+void CLHOG::do_unmap(cl::CommandQueue &q)
+{
+    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+    q.enqueueUnmapMemObject(_buffer, descriptor());
+}
\ No newline at end of file
diff --git a/src/runtime/CL/CLLut.cpp b/src/runtime/CL/CLLut.cpp
new file mode 100644
index 0000000000..a8cbf2131f
--- /dev/null
+++ b/src/runtime/CL/CLLut.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLLut.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <cstring>
+
+using namespace arm_compute;
+
+CLLut::CLLut()
+    : _allocator()
+{
+}
+
+CLLut::CLLut(size_t num_elements, DataType data_type)
+    : _allocator()
+{
+    _allocator.init(num_elements, data_type);
+}
+
+size_t CLLut::num_elements() const
+{
+    return _allocator.num_elements();
+}
+
+uint32_t CLLut::index_offset() const
+{
+    return (DataType::S16 == _allocator.type()) ? num_elements() / 2 : 0;
+}
+
+size_t CLLut::size_in_bytes() const
+{
+    return _allocator.size();
+}
+
+DataType CLLut::type() const
+{
+    return _allocator.type();
+}
+
+const cl::Buffer &CLLut::cl_buffer() const
+{
+    return _allocator.cl_data();
+}
+
+void CLLut::clear()
+{
+    cl::CommandQueue &q = CLScheduler::get().queue();
+    uint8_t *data = _allocator.map(q, true /* blocking */);
+    std::memset(data, 0, size_in_bytes());
+    _allocator.unmap(q, data);
+}
+
+ILutAllocator *CLLut::allocator()
+{
+    return &_allocator;
+}
+
+void CLLut::map(bool blocking)
+{
+    ICLLut::map(CLScheduler::get().queue(), blocking);
+}
+
+void CLLut::unmap()
+{
+    ICLLut::unmap(CLScheduler::get().queue());
+}
+
+uint8_t *CLLut::do_map(cl::CommandQueue &q, bool blocking)
+{
+    return _allocator.map(q, blocking);
+}
+
+void CLLut::do_unmap(cl::CommandQueue &q)
+{
+    _allocator.unmap(q, buffer());
+}
diff --git a/src/runtime/CL/CLLutAllocator.cpp b/src/runtime/CL/CLLutAllocator.cpp
new file mode 100644
index 0000000000..311de4bb8d
--- /dev/null
+++ b/src/runtime/CL/CLLutAllocator.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLLutAllocator.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLLutAllocator::CLLutAllocator()
+    : _buffer(), _mapping(nullptr)
+{
+}
+
+uint8_t *CLLutAllocator::data()
+{
+    return _mapping;
+}
+
+const cl::Buffer &CLLutAllocator::cl_data() const
+{
+    return _buffer;
+}
+
+uint8_t *CLLutAllocator::map(cl::CommandQueue &q, bool blocking)
+{
+    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+    return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, size()));
+}
+
+void CLLutAllocator::unmap(cl::CommandQueue &q, uint8_t *mapping)
+{
+    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+    q.enqueueUnmapMemObject(_buffer, mapping);
+}
+
+void CLLutAllocator::allocate()
+{
+    _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size());
+}
+
+uint8_t *CLLutAllocator::lock()
+{
+    ARM_COMPUTE_ERROR_ON(_mapping != nullptr);
+    cl::CommandQueue q = CLScheduler::get().queue();
+    _mapping = map(q, true);
+    return _mapping;
+}
+
+void CLLutAllocator::unlock()
+{
+    ARM_COMPUTE_ERROR_ON(_mapping == nullptr);
+    cl::CommandQueue q = CLScheduler::get().queue();
+    unmap(q, _mapping);
+    _mapping = nullptr;
+}
diff --git a/src/runtime/CL/CLMultiHOG.cpp b/src/runtime/CL/CLMultiHOG.cpp
new file mode 100644
index 0000000000..b9e8739454
--- /dev/null
+++ b/src/runtime/CL/CLMultiHOG.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLMultiHOG.h"
+
+#include "arm_compute/core/CL/ICLHOG.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+
+using namespace arm_compute;
+
+CLMultiHOG::CLMultiHOG(size_t num_models)
+    : _num_models(num_models), _model(arm_compute::cpp14::make_unique<CLHOG[]>(_num_models))
+{
+}
+
+size_t CLMultiHOG::num_models() const
+{
+    return _num_models;
+}
+
+ICLHOG *CLMultiHOG::cl_model(size_t index)
+{
+    ARM_COMPUTE_ERROR_ON(index >= _num_models);
+    return (_model.get() + index);
+}
+
+const ICLHOG *CLMultiHOG::cl_model(size_t index) const
+{
+    ARM_COMPUTE_ERROR_ON(index >= _num_models);
+    return (_model.get() + index);
+}
\ No newline at end of file
diff --git a/src/runtime/CL/CLMultiImage.cpp b/src/runtime/CL/CLMultiImage.cpp
new file mode 100644
index 0000000000..63059cb5f4
--- /dev/null
+++ b/src/runtime/CL/CLMultiImage.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#include "arm_compute/runtime/CL/CLMultiImage.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/ITensorAllocator.h" + +using namespace arm_compute; + +CLMultiImage::CLMultiImage() + : _info(), _plane() +{ +} + +const MultiImageInfo *CLMultiImage::info() const +{ + return &_info; +} + +void CLMultiImage::init(unsigned int width, unsigned int height, Format format) +{ + internal_init(width, height, format, false); +} + +void CLMultiImage::init_auto_padding(unsigned int width, unsigned int height, Format format) +{ + internal_init(width, height, format, true); +} + +void CLMultiImage::internal_init(unsigned int width, unsigned int height, Format format, bool auto_padding) +{ + TensorInfo info(width, height, Format::U8); + + if(auto_padding) + { + info.auto_padding(); + } + + switch(format) + { + case Format::U8: + case Format::S16: + case Format::U16: + case Format::S32: + case Format::F16: + case Format::F32: + case Format::U32: + case Format::RGB888: + case Format::RGBA8888: + case Format::YUYV422: + case Format::UYVY422: + { + TensorInfo info_full(width, height, format); + + if(auto_padding) + { + info_full.auto_padding(); + } + + std::get<0>(_plane).allocator()->init(info_full); + break; + } + case Format::NV12: + case Format::NV21: + { + TensorInfo info_uv88(width / 2, height / 2, Format::UV88); + + if(auto_padding) + { + info_uv88.auto_padding(); + } + + std::get<0>(_plane).allocator()->init(info); + std::get<1>(_plane).allocator()->init(info_uv88); + break; + } + case Format::IYUV: + { + TensorInfo info_sub2(width / 2, height / 2, Format::U8); + + if(auto_padding) + { + info_sub2.auto_padding(); + } + + std::get<0>(_plane).allocator()->init(info); + std::get<1>(_plane).allocator()->init(info_sub2); + std::get<2>(_plane).allocator()->init(info_sub2); + break; + } + case Format::YUV444: + std::get<0>(_plane).allocator()->init(info); + std::get<1>(_plane).allocator()->init(info); + std::get<2>(_plane).allocator()->init(info); + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + + _info.init(width, height, format); +} + +void CLMultiImage::allocate() +{ + switch(_info.format()) + { + case Format::U8: + case Format::S16: + case Format::U16: + case Format::S32: + case Format::F16: + case Format::F32: + case Format::U32: + case Format::RGB888: + case Format::RGBA8888: + case Format::YUYV422: + case Format::UYVY422: + std::get<0>(_plane).allocator()->allocate(); + break; + case Format::NV12: + case Format::NV21: + std::get<0>(_plane).allocator()->allocate(); + std::get<1>(_plane).allocator()->allocate(); + break; + case Format::IYUV: + case Format::YUV444: + std::get<0>(_plane).allocator()->allocate(); + std::get<1>(_plane).allocator()->allocate(); + std::get<2>(_plane).allocator()->allocate(); + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } +} + +CLImage *CLMultiImage::cl_plane(unsigned int index) +{ + return &_plane[index]; +} + +const CLImage *CLMultiImage::cl_plane(unsigned int index) const +{ + return &_plane[index]; +} diff --git a/src/runtime/CL/CLPyramid.cpp b/src/runtime/CL/CLPyramid.cpp new file mode 100644 index 0000000000..41d81ea0f8 --- /dev/null +++ b/src/runtime/CL/CLPyramid.cpp @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLPyramid.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PyramidInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+
+#include <cmath>
+#include <cstddef>
+
+using namespace arm_compute;
+
+CLPyramid::CLPyramid()
+    : _info(), _pyramid(nullptr)
+{
+}
+
+void CLPyramid::init(const PyramidInfo &info)
+{
+    internal_init(info, false);
+}
+
+void CLPyramid::init_auto_padding(const PyramidInfo &info)
+{
+    internal_init(info, true);
+}
+
+void CLPyramid::internal_init(const PyramidInfo &info, bool auto_padding)
+{
+    _info = info;
+    _pyramid = arm_compute::cpp14::make_unique<CLTensor[]>(_info.num_levels());
+
+    size_t w = _info.width();
+    size_t h = _info.height();
+    size_t ref_w = w;
+    size_t ref_h = h;
+    const bool is_orb_scale = (SCALE_PYRAMID_ORB == _info.scale());
+    TensorShape tensor_shape = _info.tensor_shape();
+
+    // Note: Look-up table used by the OpenVX sample implementation
+    const std::array<float, 4> c_orbscale =
+    {
+        {
+            0.5f,
+            SCALE_PYRAMID_ORB,
+            SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB,
+            SCALE_PYRAMID_ORB *SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB
+        }
+    };
+
+    for(size_t i = 0; i < _info.num_levels(); ++i)
+    {
+        TensorInfo tensor_info(tensor_shape, _info.format());
+
+        if(auto_padding)
+        {
+            tensor_info.auto_padding();
+        }
+
+        _pyramid[i].allocator()->init(tensor_info);
+
+        if(is_orb_scale)
+        {
+            const float orb_scale = c_orbscale[(i + 1) % 4];
+            w = std::ceil(ref_w * orb_scale);
+            h = std::ceil(ref_h * orb_scale);
+
+            if(0 == ((i + 1) % 4))
+            {
+                ref_w = w;
+                ref_h = h;
+            }
+        }
+        else
+        {
+            w = (w + 1) * _info.scale();
+            h = (h + 1) * _info.scale();
+        }
+
+        // Update tensor_shape
+        tensor_shape.set(0, w);
+        tensor_shape.set(1, h);
+    }
+}
+
+void CLPyramid::allocate()
+{
+    ARM_COMPUTE_ERROR_ON(_pyramid == nullptr);
+
+    for(size_t i = 0; i < _info.num_levels(); ++i)
+    {
+        (_pyramid.get() + i)->allocator()->allocate();
+    }
+}
+
+const PyramidInfo *CLPyramid::info() const
+{
+    return &_info;
+}
+
+CLTensor *CLPyramid::get_pyramid_level(size_t index) const
+{
+    ARM_COMPUTE_ERROR_ON(index >= _info.num_levels());
+
+    return (_pyramid.get() + index);
+}
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
new file mode 100644
index 0000000000..fe25ce534c
--- /dev/null
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, 2017 ARM
Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "arm_compute/core/CL/ICLKernel.h" + +using namespace arm_compute; + +CLScheduler::CLScheduler() + : _context(), _queue(), _target(GPUTarget::MIDGARD) +{ +} + +CLScheduler &CLScheduler::get() +{ + static CLScheduler scheduler; + return scheduler; +} + +void CLScheduler::enqueue(ICLKernel &kernel, bool flush) +{ + kernel.run(kernel.window(), _queue); + + if(flush) + { + _queue.flush(); + } +} diff --git a/src/runtime/CL/CLSubTensor.cpp b/src/runtime/CL/CLSubTensor.cpp new file mode 100644 index 0000000000..b228c0abda --- /dev/null +++ b/src/runtime/CL/CLSubTensor.cpp @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/CLSubTensor.h" + +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +using namespace arm_compute; + +CLSubTensor::CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords) + : _parent(nullptr), _info() +{ + ARM_COMPUTE_ERROR_ON(parent == nullptr); + _info = SubTensorInfo(parent->info(), tensor_shape, coords); + _parent = parent; +} + +ITensorInfo *CLSubTensor::info() const +{ + return &_info; +} + +ITensorInfo *CLSubTensor::info() +{ + return &_info; +} + +const cl::Buffer &CLSubTensor::cl_buffer() const +{ + ARM_COMPUTE_ERROR_ON(_parent == nullptr); + return _parent->cl_buffer(); +} + +ICLTensor *CLSubTensor::parent() +{ + return _parent; +} + +void CLSubTensor::map(bool blocking) +{ + ICLTensor::map(CLScheduler::get().queue(), blocking); +} + +void CLSubTensor::unmap() +{ + ICLTensor::unmap(CLScheduler::get().queue()); +} + +uint8_t *CLSubTensor::do_map(cl::CommandQueue &q, bool blocking) +{ + ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr); + return static_cast(q.enqueueMapBuffer(cl_buffer(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->total_size())); +} + +void CLSubTensor::do_unmap(cl::CommandQueue &q) +{ + ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr); + q.enqueueUnmapMemObject(cl_buffer(), buffer()); +} diff --git a/src/runtime/CL/CLTensor.cpp b/src/runtime/CL/CLTensor.cpp new file mode 100644 index 0000000000..eefa0331d5 --- /dev/null +++ b/src/runtime/CL/CLTensor.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/CLTensor.h" + +#include "arm_compute/runtime/CL/CLScheduler.h" + +using namespace arm_compute; + +CLTensor::CLTensor() + : _allocator() +{ +} + +TensorInfo *CLTensor::info() const +{ + return &_allocator.info(); +} + +TensorInfo *CLTensor::info() +{ + return &_allocator.info(); +} + +const cl::Buffer &CLTensor::cl_buffer() const +{ + return _allocator.cl_data(); +} + +ITensorAllocator *CLTensor::allocator() +{ + return &_allocator; +} + +void CLTensor::map(bool blocking) +{ + ICLTensor::map(CLScheduler::get().queue(), blocking); +} + +void CLTensor::unmap() +{ + ICLTensor::unmap(CLScheduler::get().queue()); +} + +uint8_t *CLTensor::do_map(cl::CommandQueue &q, bool blocking) +{ + return _allocator.map(q, blocking); +} + +void CLTensor::do_unmap(cl::CommandQueue &q) +{ + _allocator.unmap(q, buffer()); +} diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp new file mode 100644 index 0000000000..8112a7148f --- /dev/null +++ b/src/runtime/CL/CLTensorAllocator.cpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/CLTensorAllocator.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +using namespace arm_compute; + +CLTensorAllocator::CLTensorAllocator() + : _buffer(), _mapping(nullptr) +{ +} + +uint8_t *CLTensorAllocator::data() +{ + return _mapping; +} + +const cl::Buffer &CLTensorAllocator::cl_data() const +{ + return _buffer; +} + +void CLTensorAllocator::allocate() +{ + ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr); + + _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info().total_size()); + info().set_is_resizable(false); +} + +void CLTensorAllocator::free() +{ + ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr); + + _buffer = cl::Buffer(); + info().set_is_resizable(true); +} + +uint8_t *CLTensorAllocator::lock() +{ + ARM_COMPUTE_ERROR_ON(_mapping != nullptr); + _mapping = map(CLScheduler::get().queue(), true); + return _mapping; +} + +void CLTensorAllocator::unlock() +{ + ARM_COMPUTE_ERROR_ON(_mapping == nullptr); + unmap(CLScheduler::get().queue(), _mapping); + _mapping = nullptr; +} + +uint8_t *CLTensorAllocator::map(cl::CommandQueue &q, bool blocking) +{ + ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr); + return static_cast(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info().total_size())); +} + +void CLTensorAllocator::unmap(cl::CommandQueue &q, uint8_t *mapping) +{ + ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr); + q.enqueueUnmapMemObject(_buffer, mapping); +} diff --git a/src/runtime/CL/ICLSimpleFunction.cpp b/src/runtime/CL/ICLSimpleFunction.cpp new file mode 100644 index 0000000000..aa45743d37 --- /dev/null +++ b/src/runtime/CL/ICLSimpleFunction.cpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +using namespace arm_compute; + +ICLSimpleFunction::ICLSimpleFunction() + : _kernel(), _border_handler() +{ +} + +void ICLSimpleFunction::run() +{ + ARM_COMPUTE_ERROR_ON_MSG(!_kernel, "The child class didn't set the CL kernel or function isn't configured"); + + CLScheduler::get().enqueue(_border_handler, false); + CLScheduler::get().enqueue(*_kernel); +} diff --git a/src/runtime/CL/functions/CLAbsoluteDifference.cpp b/src/runtime/CL/functions/CLAbsoluteDifference.cpp new file mode 100644 index 0000000000..5097dd4710 --- /dev/null +++ b/src/runtime/CL/functions/CLAbsoluteDifference.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLAbsoluteDifference.h" + +#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h" +#include "arm_compute/core/Helpers.h" + +#include + +using namespace arm_compute; + +void CLAbsoluteDifference::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input1, input2, output); + _kernel = std::move(k); +} diff --git a/src/runtime/CL/functions/CLAccumulate.cpp b/src/runtime/CL/functions/CLAccumulate.cpp new file mode 100644 index 0000000000..56c519984c --- /dev/null +++ b/src/runtime/CL/functions/CLAccumulate.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLAccumulate.h" + +#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h" +#include "arm_compute/core/Helpers.h" + +#include + +using namespace arm_compute; + +void CLAccumulate::configure(const ICLTensor *input, ICLTensor *accum) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, accum); + _kernel = std::move(k); +} + +void CLAccumulateWeighted::configure(const ICLTensor *input, float alpha, ICLTensor *accum) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, alpha, accum); + _kernel = std::move(k); +} + +void CLAccumulateSquared::configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, shift, accum); + _kernel = std::move(k); +} diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp new file mode 100644 index 0000000000..9b5bd8b663 --- /dev/null +++ b/src/runtime/CL/functions/CLActivationLayer.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLActivationLayer.h" + +#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h" +#include "arm_compute/core/Helpers.h" + +using namespace arm_compute; + +void CLActivationLayer::configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, act_info); + _kernel = std::move(k); +} diff --git a/src/runtime/CL/functions/CLArithmeticAddition.cpp b/src/runtime/CL/functions/CLArithmeticAddition.cpp new file mode 100644 index 0000000000..36bff4285c --- /dev/null +++ b/src/runtime/CL/functions/CLArithmeticAddition.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLArithmeticAddition.h" + +#include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h" +#include "arm_compute/core/Helpers.h" + +#include + +using namespace arm_compute; + +void CLArithmeticAddition::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input1, input2, output, policy); + _kernel = std::move(k); +} diff --git a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp new file mode 100644 index 0000000000..97f0a1caf4 --- /dev/null +++ b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h" + +#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h" +#include "arm_compute/core/Helpers.h" + +#include + +using namespace arm_compute; + +void CLArithmeticSubtraction::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input1, input2, output, policy); + _kernel = std::move(k); +} diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp new file mode 100644 index 0000000000..3df673c6a6 --- /dev/null +++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +using namespace arm_compute; + +CLBatchNormalizationLayer::CLBatchNormalizationLayer() + : _norm_kernel() +{ +} + +void CLBatchNormalizationLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon) +{ + _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon); +} + +void CLBatchNormalizationLayer::run() +{ + CLScheduler::get().enqueue(_norm_kernel, true); +} diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp new file mode 100644 index 0000000000..7c85043206 --- /dev/null +++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h" + +#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h" +#include "arm_compute/core/Helpers.h" + +#include + +using namespace arm_compute; + +void CLBitwiseAnd::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input1, input2, output); + _kernel = std::move(k); +} diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp new file mode 100644 index 0000000000..17ae5dea3c --- /dev/null +++ b/src/runtime/CL/functions/CLBitwiseNot.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLBitwiseNot.h" + +#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h" +#include "arm_compute/core/Helpers.h" + +#include + +using namespace arm_compute; + +void CLBitwiseNot::configure(const ICLTensor *input, ICLTensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); +} diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp new file mode 100644 index 0000000000..c84a279bae --- /dev/null +++ b/src/runtime/CL/functions/CLBitwiseOr.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLBitwiseOr.h" + +#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h" +#include "arm_compute/core/Helpers.h" + +#include + +using namespace arm_compute; + +void CLBitwiseOr::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input1, input2, output); + _kernel = std::move(k); +} diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp new file mode 100644 index 0000000000..fd49c7d818 --- /dev/null +++ b/src/runtime/CL/functions/CLBitwiseXor.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLBitwiseXor.h" + +#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h" +#include "arm_compute/core/Helpers.h" + +#include + +using namespace arm_compute; + +void CLBitwiseXor::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input1, input2, output); + _kernel = std::move(k); +} diff --git a/src/runtime/CL/functions/CLBox3x3.cpp b/src/runtime/CL/functions/CLBox3x3.cpp new file mode 100644 index 0000000000..8de6807c73 --- /dev/null +++ b/src/runtime/CL/functions/CLBox3x3.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLBox3x3.h" + +#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/PixelValue.h" + +#include + +using namespace arm_compute; + +void CLBox3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, border_mode == BorderMode::UNDEFINED); + _kernel = std::move(k); + _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value)); +} diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp new file mode 100644 index 0000000000..1d018b8347 --- /dev/null +++ b/src/runtime/CL/functions/CLCannyEdge.cpp @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLCannyEdge.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/functions/CLSobel3x3.h" +#include "arm_compute/runtime/CL/functions/CLSobel5x5.h" +#include "arm_compute/runtime/CL/functions/CLSobel7x7.h" + +using namespace arm_compute; + +CLCannyEdge::CLCannyEdge() + : _sobel(nullptr), _gradient(), _border_mag_gradient(), _non_max_suppr(), _edge_trace(), _gx(), _gy(), _mag(), _phase(), _nonmax(), _visited(), _recorded(), _l1_list_counter(), _l1_stack() +{ +} + +void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON((1 != norm_type) && (2 != norm_type)); + ARM_COMPUTE_ERROR_ON(lower_thr > upper_thr); + + const unsigned int L1_hysteresis_stack_size = 8; + const TensorShape shape = input->info()->tensor_shape(); + + TensorInfo gradient_info; + TensorInfo info; + + // Initialize images + if(gradient_size < 7) + { + gradient_info.init(shape, 1, arm_compute::DataType::S16); + info.init(shape, 1, arm_compute::DataType::U16); + } + else + { + gradient_info.init(shape, 1, arm_compute::DataType::S32); + info.init(shape, 1, arm_compute::DataType::U32); + } + + _gx.allocator()->init(gradient_info); + _gy.allocator()->init(gradient_info); + _mag.allocator()->init(info); + _nonmax.allocator()->init(info); + + TensorInfo info_u8(shape, 1, arm_compute::DataType::U8); + _phase.allocator()->init(info_u8); + _l1_list_counter.allocator()->init(info_u8); + + TensorInfo info_u32(shape, 1, arm_compute::DataType::U32); + _visited.allocator()->init(info_u32); + _recorded.allocator()->init(info_u32); + + TensorShape shape_l1_stack = input->info()->tensor_shape(); + shape_l1_stack.set(0, input->info()->dimension(0) * L1_hysteresis_stack_size); + TensorInfo info_s32(shape_l1_stack, 1, arm_compute::DataType::S32); + _l1_stack.allocator()->init(info_s32); + + // Configure/Init sobelNxN + if(gradient_size == 3) + { + 
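+ // The gradient size chosen here also drove the intermediate formats initialised above: 3x3 and 5x5 Sobel produce
+ // S16 gradients with a U16 magnitude image, while 7x7 needs the S32/U32 variants. Each branch below simply
+ // instantiates the matching CLSobelNxN function and hands it to _sobel, so run() does not care which one was picked.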
auto k = arm_compute::cpp14::make_unique<CLSobel3x3>(); + k->configure(input, &_gx, &_gy, border_mode, constant_border_value); + _sobel = std::move(k); + } + else if(gradient_size == 5) + { + auto k = arm_compute::cpp14::make_unique<CLSobel5x5>(); + k->configure(input, &_gx, &_gy, border_mode, constant_border_value); + _sobel = std::move(k); + } + else if(gradient_size == 7) + { + auto k = arm_compute::cpp14::make_unique<CLSobel7x7>(); + k->configure(input, &_gx, &_gy, border_mode, constant_border_value); + _sobel = std::move(k); + } + else + { + ARM_COMPUTE_ERROR("Gradient size %d not supported", gradient_size); + } + + // Configure gradient + _gradient.configure(&_gx, &_gy, &_mag, &_phase, norm_type); + + // Configure non-maxima suppression + _non_max_suppr.configure(&_mag, &_phase, &_nonmax, lower_thr, border_mode == BorderMode::UNDEFINED); + + // Fill border around magnitude image as non-maxima suppression will access + // it. If border mode is undefined filling the border is a nop. + _border_mag_gradient.configure(&_mag, _non_max_suppr.border_size(), border_mode, constant_border_value); + + // Configure edge tracing + _edge_trace.configure(&_nonmax, output, upper_thr, lower_thr, &_visited, &_recorded, &_l1_stack, &_l1_list_counter); + + _gx.allocator()->allocate(); + _gy.allocator()->allocate(); + _phase.allocator()->allocate(); + _mag.allocator()->allocate(); + _visited.allocator()->allocate(); + _recorded.allocator()->allocate(); + _l1_stack.allocator()->allocate(); + _l1_list_counter.allocator()->allocate(); + _nonmax.allocator()->allocate(); +} + +void CLCannyEdge::run() +{ + // Run sobel + _sobel->run(); + + // Run phase and magnitude calculation + CLScheduler::get().enqueue(_gradient, false); + + // Fill border before non-maxima suppression. Nop for border mode undefined. + CLScheduler::get().enqueue(_border_mag_gradient, false); + + // Run non-maxima suppression + _nonmax.clear(CLScheduler::get().queue()); + CLScheduler::get().enqueue(_non_max_suppr, false); + + // Clear temporary structures and run edge trace + _visited.clear(CLScheduler::get().queue()); + _recorded.clear(CLScheduler::get().queue()); + _l1_list_counter.clear(CLScheduler::get().queue()); + _l1_stack.clear(CLScheduler::get().queue()); + CLScheduler::get().enqueue(_edge_trace, true); +} diff --git a/src/runtime/CL/functions/CLChannelCombine.cpp b/src/runtime/CL/functions/CLChannelCombine.cpp new file mode 100644 index 0000000000..79a3676bd7 --- /dev/null +++ b/src/runtime/CL/functions/CLChannelCombine.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLChannelCombine.h" + +#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h" +#include "arm_compute/core/Helpers.h" + +#include + +using namespace arm_compute; + +void CLChannelCombine::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(plane0, plane1, plane2, plane3, output); + _kernel = std::move(k); +} + +void CLChannelCombine::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(plane0, plane1, plane2, output); + _kernel = std::move(k); +} diff --git a/src/runtime/CL/functions/CLChannelExtract.cpp b/src/runtime/CL/functions/CLChannelExtract.cpp new file mode 100644 index 0000000000..2c6174b9ee --- /dev/null +++ b/src/runtime/CL/functions/CLChannelExtract.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLChannelExtract.h" + +#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h" +#include "arm_compute/core/Helpers.h" + +#include + +using namespace arm_compute; + +void CLChannelExtract::configure(const ICLTensor *input, Channel channel, ICLTensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, channel, output); + _kernel = std::move(k); +} + +void CLChannelExtract::configure(const ICLMultiImage *input, Channel channel, ICLImage *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, channel, output); + _kernel = std::move(k); +} diff --git a/src/runtime/CL/functions/CLColorConvert.cpp b/src/runtime/CL/functions/CLColorConvert.cpp new file mode 100644 index 0000000000..2fe465aeb8 --- /dev/null +++ b/src/runtime/CL/functions/CLColorConvert.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLColorConvert.h" + +#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h" +#include "arm_compute/core/Helpers.h" + +#include + +using namespace arm_compute; + +void CLColorConvert::configure(const ICLTensor *input, ICLTensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); +} + +void CLColorConvert::configure(const ICLImage *input, ICLMultiImage *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); +} + +void CLColorConvert::configure(const ICLMultiImage *input, ICLImage *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); +} + +void CLColorConvert::configure(const ICLMultiImage *input, ICLMultiImage *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); +} diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp new file mode 100644 index 0000000000..21b5d47679 --- /dev/null +++ b/src/runtime/CL/functions/CLConvolution.cpp @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLConvolution.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/ITensorAllocator.h" + +#include <utility> + +using namespace arm_compute; + +void CLConvolution3x3::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) +{ + auto k = arm_compute::cpp14::make_unique<CLConvolution3x3Kernel>(); + k->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED); + _kernel = std::move(k); + _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); +} + +template <unsigned int matrix_size> +CLConvolutionSquare<matrix_size>::CLConvolutionSquare() + : _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler() +{ +} + +template <unsigned int matrix_size> +void CLConvolutionSquare<matrix_size>::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(conv == nullptr); + int16_t conv_col[matrix_size]; + int16_t conv_row[matrix_size]; + _is_separable = separate_matrix(conv, conv_col, conv_row, matrix_size); + + if(_is_separable) + { + std::pair<DataType, DataType> type_pair = data_type_for_convolution(conv_col, conv_row, matrix_size); + _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, type_pair.first)); + + if(scale == 0) + { + scale = calculate_matrix_scale(conv, matrix_size); + } + + _kernel_hor.configure(input, &_tmp, conv_row, border_mode == BorderMode::UNDEFINED); + _kernel_vert.configure(&_tmp, output, conv_col, scale, border_mode == BorderMode::UNDEFINED, type_pair.second); + _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value)); + + // Allocate intermediate buffer + _tmp.allocator()->allocate(); + } + else + { + _kernel.configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED); + _border_handler.configure(input, _kernel.border_size(), border_mode, PixelValue(constant_border_value)); + } +} + +template <unsigned int matrix_size> +void CLConvolutionSquare<matrix_size>::run() +{ + CLScheduler::get().enqueue(_border_handler); + + if(_is_separable) + { + CLScheduler::get().enqueue(_kernel_hor, false); + CLScheduler::get().enqueue(_kernel_vert); + } + else + { + CLScheduler::get().enqueue(_kernel); + } +} + +template class arm_compute::CLConvolutionSquare<5>; +template class arm_compute::CLConvolutionSquare<7>; +template class arm_compute::CLConvolutionSquare<9>; + +void CLConvolutionRectangle::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) +{ + auto k = arm_compute::cpp14::make_unique<CLConvolutionRectangleKernel>(); + k->configure(input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED); + _kernel = std::move(k); + _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); +} diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp new file mode 100644 index 0000000000..f0bbc3514f --- /dev/null +++ 
b/src/runtime/CL/functions/CLConvolutionLayer.cpp @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" + +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include +#include + +using namespace arm_compute; + +CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights() + : _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false) +{ +} + +void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose1xW) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases, output); + ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4); + + if(biases != nullptr) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); + ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3)); + ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1); + } + + const bool _has_bias = (biases != nullptr); + + _transpose1xW = transpose1xW; + + if(transpose1xW) + { + // Create tensor to store the reshaped weights + const unsigned int mat_weights_cols = weights->info()->dimension(3); + const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 
1 : 0); + TensorShape shape_wr(mat_weights_cols, mat_weights_rows); + const DataType dt = weights->info()->data_type(); + TensorInfo info_wr(shape_wr, 1, dt); + + _weights_reshaped.allocator()->init(info_wr); + _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped); + _weights_transposed_kernel.configure(&_weights_reshaped, output); + _weights_reshaped.allocator()->allocate(); + } + else + { + _weights_reshape_kernel.configure(weights, biases, output); + } +} + +void CLConvolutionLayerReshapeWeights::run() +{ + cl::CommandQueue q = CLScheduler::get().queue(); + CLScheduler::get().enqueue(_weights_reshape_kernel); + if(_transpose1xW) + { + CLScheduler::get().enqueue(_weights_transposed_kernel); + } +} + +CLConvolutionLayer::CLConvolutionLayer() + : _reshape_weights(), _input_im2col_kernel(), _input_interleave_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), + _weights_transposed(), _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false) +{ +} + +void CLConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output); + ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2)); + ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4); + + if(biases != nullptr) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); + ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3)); + ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1); + } + + _has_bias = (biases != nullptr); + _are_weights_reshaped = weights_info.are_reshaped(); + + // Get parameters for conv_info + unsigned int stride_x = 0; + unsigned int stride_y = 0; + unsigned int pad_x = 0; + unsigned int pad_y = 0; + std::tie(stride_x, stride_y) = conv_info.stride(); + std::tie(pad_x, pad_y) = conv_info.pad(); + + // Get convolved dimensions + unsigned int conv_w = 0; + unsigned int conv_h = 0; + + const unsigned int kernel_width = _are_weights_reshaped ? weights_info.kernel_size() : weights->info()->dimension(0); + std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, + stride_x, stride_y, pad_x, pad_y, conv_info.round()); + ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one"); + + // Check if its a "fully connected" convolution + _is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1)); + + // Create tensor to store the reshaped weights + size_t mat_weights_cols = weights->info()->dimension(3); + size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 
1 : 0); + if(_are_weights_reshaped) + { + mat_weights_cols = output->info()->dimension(2); + const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4; + mat_weights_rows = (_has_bias ? 1 + quarter_reshaped_cols : quarter_reshaped_cols); + } + else + { + if(_is_fully_connected_convolution) + { + // Create tensor to store the reshaped weights + TensorShape shape_wr(mat_weights_cols, mat_weights_rows); + TensorInfo info_wr(shape_wr, 1, weights->info()->data_type()); + _weights_reshaped.allocator()->init(info_wr); + _reshape_weights.configure(weights, biases, &_weights_reshaped, false); + weights = &_weights_reshaped; + } + else + { + // Create tensor to store transposed weights + TensorShape shape_wt(mat_weights_rows * 4, static_cast(std::ceil(mat_weights_cols / 4.f))); + TensorInfo info_wt(shape_wt, 1, weights->info()->data_type()); + _weights_transposed.allocator()->init(info_wt); + _reshape_weights.configure(weights, biases, &_weights_transposed, true); + weights = &_weights_transposed; + } + } + // Create tensor to store im2col reshaped inputs + const size_t mat_input_cols = mat_weights_rows; + const size_t mat_input_rows = conv_w * conv_h; + TensorShape shape_im2col = input->info()->tensor_shape(); + shape_im2col.set(0, mat_input_cols); + shape_im2col.set(1, mat_input_rows); + shape_im2col.set(2, 1); + _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type())); + + // Create tensor (interleave) to prepare input tensor for GEMM + if(!_is_fully_connected_convolution) + { + TensorShape shape_interleaved = shape_im2col; + shape_interleaved.set(0, shape_interleaved.x() * 4); + shape_interleaved.set(1, std::ceil(static_cast(shape_interleaved.y()) / 4.f)); + _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type())); + } + + // Create GEMM output tensor + TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape(); + shape_gemm.set(0, mat_weights_cols); + shape_gemm.set(1, mat_input_rows); + _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type())); + + // Configure kernels + _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias); + _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h)); + + if(_is_fully_connected_convolution) + { + _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f); + } + else + { + _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped); + _mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f); + } + + if(!_are_weights_reshaped) + { + if(!_is_fully_connected_convolution) + { + _weights_transposed.allocator()->allocate(); + } + else + { + _weights_reshaped.allocator()->allocate(); + } + } + + _input_im2col_reshaped.allocator()->allocate(); + if(!_is_fully_connected_convolution) + { + _input_interleaved_reshaped.allocator()->allocate(); + } + _gemm_output.allocator()->allocate(); +} + +void CLConvolutionLayer::run() +{ + // Run weights reshaping (Runs once for every configure) + if(!_are_weights_reshaped) + { + _are_weights_reshaped = true; + _reshape_weights.run(); + } + + // Run input reshaping + CLScheduler::get().enqueue(_input_im2col_kernel); + if(!_is_fully_connected_convolution) + { + CLScheduler::get().enqueue(_input_interleave_kernel); + } + + // Runs matrix multiply on reshaped matrices + CLScheduler::get().enqueue(_mm_kernel); + + // Reshape 
output matrix + CLScheduler::get().enqueue(_output_col2im_kernel, false); +} diff --git a/src/runtime/CL/functions/CLDepthConcatenate.cpp b/src/runtime/CL/functions/CLDepthConcatenate.cpp new file mode 100644 index 0000000000..d967d9865f --- /dev/null +++ b/src/runtime/CL/functions/CLDepthConcatenate.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h" +#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +using namespace arm_compute; + +CLDepthConcatenate::CLDepthConcatenate() + : _inputs_vector(), _concat_kernels_vector(), _border_handlers_vector(), _num_inputs(0) +{ +} + +void CLDepthConcatenate::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2); + + _num_inputs = inputs_vector.size(); + + unsigned int depth_offset = 0; + + _concat_kernels_vector = arm_compute::cpp14::make_unique<CLDepthConcatenateKernel[]>(_num_inputs); + _border_handlers_vector = arm_compute::cpp14::make_unique<CLFillBorderKernel[]>(_num_inputs); + + for(unsigned int i = 0; i < _num_inputs; i++) + { + _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output); + _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0)); + + depth_offset += inputs_vector.at(i)->info()->dimension(2); + } +} + +void CLDepthConcatenate::run() +{ + cl::CommandQueue q = CLScheduler::get().queue(); + + for(unsigned i = 0; i < _num_inputs; i++) + { + CLScheduler::get().enqueue(_border_handlers_vector[i], false); + CLScheduler::get().enqueue(_concat_kernels_vector[i], true); + } +} diff --git a/src/runtime/CL/functions/CLDepthConvert.cpp b/src/runtime/CL/functions/CLDepthConvert.cpp new file mode 100644 index 0000000000..edcd4928ab --- /dev/null +++ b/src/runtime/CL/functions/CLDepthConvert.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLDepthConvert.h" + +#include "arm_compute/core/CL/kernels/CLDepthConvertKernel.h" +#include "arm_compute/core/Helpers.h" + +#include + +using namespace arm_compute; + +void CLDepthConvert::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, policy, shift); + _kernel = std::move(k); +} diff --git a/src/runtime/CL/functions/CLDerivative.cpp b/src/runtime/CL/functions/CLDerivative.cpp new file mode 100644 index 0000000000..c51cb4c333 --- /dev/null +++ b/src/runtime/CL/functions/CLDerivative.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLDerivative.h" + +#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/PixelValue.h" + +#include + +using namespace arm_compute; + +void CLDerivative::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED); + _kernel = std::move(k); + _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value)); +} diff --git a/src/runtime/CL/functions/CLDilate.cpp b/src/runtime/CL/functions/CLDilate.cpp new file mode 100644 index 0000000000..345f47763c --- /dev/null +++ b/src/runtime/CL/functions/CLDilate.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLDilate.h" + +#include "arm_compute/core/CL/kernels/CLDilateKernel.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/PixelValue.h" + +#include + +using namespace arm_compute; + +void CLDilate::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, border_mode == BorderMode::UNDEFINED); + _kernel = std::move(k); + _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value)); +} diff --git a/src/runtime/CL/functions/CLEqualizeHistogram.cpp b/src/runtime/CL/functions/CLEqualizeHistogram.cpp new file mode 100644 index 0000000000..3b182d31b6 --- /dev/null +++ b/src/runtime/CL/functions/CLEqualizeHistogram.cpp @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLEqualizeHistogram.h" + +#include "arm_compute/core/CL/ICLDistribution1D.h" +#include "arm_compute/core/CL/ICLLut.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include +#include +#include +#include + +using namespace arm_compute; + +namespace +{ +void calculate_cum_dist_and_lut(CLDistribution1D &dist, CLDistribution1D &cum_dist, CLLut &lut) +{ + dist.map(true); + cum_dist.map(true); + lut.map(true); + + const uint32_t *dist_ptr = dist.buffer(); + uint32_t *cum_dist_ptr = cum_dist.buffer(); + uint8_t *lut_ptr = lut.buffer(); + + ARM_COMPUTE_ERROR_ON(dist_ptr == nullptr); + ARM_COMPUTE_ERROR_ON(cum_dist_ptr == nullptr); + ARM_COMPUTE_ERROR_ON(lut_ptr == nullptr); + + // Calculate cumulative distribution + std::partial_sum(dist_ptr, dist_ptr + 256, cum_dist_ptr); + + // Get the number of pixels that have the lowest value in the input image + const uint32_t num_lowest_pixels = *std::find_if(dist_ptr, dist_ptr + 256, [](const uint32_t &v) + { + return v > 0; + }); + const size_t image_size = cum_dist_ptr[255]; + + if(image_size == num_lowest_pixels) + { + std::iota(lut_ptr, lut_ptr + 256, 0); + } + else + { + const float diff = image_size - num_lowest_pixels; + + for(size_t i = 0; i < 256; ++i) + { + lut_ptr[i] = lround((cum_dist_ptr[i] - num_lowest_pixels) / diff * 255.f); + } + } + + dist.unmap(); + cum_dist.unmap(); + lut.unmap(); +} +} // namespace + +CLEqualizeHistogram::CLEqualizeHistogram() + : _histogram_kernel(), _border_histogram_kernel(), _map_histogram_kernel(), _hist(nr_bins, 0, max_range), _cum_dist(nr_bins, 0, max_range), _cd_lut(nr_bins, DataType::U8) +{ +} + +void CLEqualizeHistogram::configure(const ICLImage *input, ICLImage *output) +{ + _histogram_kernel.configure(input, &_hist); + _border_histogram_kernel.configure(input, &_hist); + _map_histogram_kernel.configure(input, &_cd_lut, output); +} + +void CLEqualizeHistogram::run() +{ + // Calculate histogram of input. + CLScheduler::get().enqueue(_histogram_kernel, false); + + // Calculate remaining pixels when image is not multiple of the elements of histogram kernel + CLScheduler::get().enqueue(_border_histogram_kernel, false); + + // Calculate cumulative distribution of histogram and create LUT. 
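+ // calculate_cum_dist_and_lut() above maps the distributions and LUT to the host and builds the equalisation table as
+ // lut[i] = lround((cdf[i] - cdf_min) / (num_pixels - cdf_min) * 255), where cdf_min is the count of the lowest
+ // populated bin; when the image holds a single grey level it falls back to the identity LUT (std::iota).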
+ calculate_cum_dist_and_lut(_hist, _cum_dist, _cd_lut); + + // Map input to output using created LUT. + CLScheduler::get().enqueue(_map_histogram_kernel); +} diff --git a/src/runtime/CL/functions/CLErode.cpp b/src/runtime/CL/functions/CLErode.cpp new file mode 100644 index 0000000000..b4c50e465a --- /dev/null +++ b/src/runtime/CL/functions/CLErode.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLErode.h" + +#include "arm_compute/core/CL/kernels/CLErodeKernel.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/PixelValue.h" + +#include + +using namespace arm_compute; + +void CLErode::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, border_mode == BorderMode::UNDEFINED); + _kernel = std::move(k); + _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value)); +} diff --git a/src/runtime/CL/functions/CLFastCorners.cpp b/src/runtime/CL/functions/CLFastCorners.cpp new file mode 100644 index 0000000000..d2903fb849 --- /dev/null +++ b/src/runtime/CL/functions/CLFastCorners.cpp @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLFastCorners.h" + +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/ITensorAllocator.h" + +#include <algorithm> +#include <cstring> + +using namespace arm_compute; + +CLFastCorners::CLFastCorners() + : _fast_corners_kernel(), + _suppr_func(), + _copy_array_kernel(), + _output(), + _suppr(), + _win(), + _non_max(false), + _num_corners(nullptr), + _num_buffer(), + _corners(nullptr), + _constant_border_value(0) +{ +} + +void CLFastCorners::configure(const ICLImage *input, float threshold, bool nonmax_suppression, CLKeyPointArray *const corners, + unsigned int *num_corners, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON(BorderMode::UNDEFINED != border_mode); + ARM_COMPUTE_ERROR_ON(nullptr == corners); + ARM_COMPUTE_ERROR_ON(threshold < 1 || threshold > 255); + + TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::U8); + _output.allocator()->init(tensor_info); + + _non_max = nonmax_suppression; + _num_corners = num_corners; + _corners = corners; + _num_buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int)); + _constant_border_value = constant_border_value; + + const bool update_number = (nullptr != _num_corners); + + _fast_corners_kernel.configure(input, &_output, threshold, nonmax_suppression, border_mode); + + if(!_non_max) + { + _copy_array_kernel.configure(&_output, update_number, corners, &_num_buffer); + } + else + { + _suppr.allocator()->init(tensor_info); + + _suppr_func.configure(&_output, &_suppr, border_mode); + _copy_array_kernel.configure(&_suppr, update_number, corners, &_num_buffer); + + _suppr.allocator()->allocate(); + } + + // Allocate intermediate tensors + _output.allocator()->allocate(); +} + +void CLFastCorners::run() +{ + cl::CommandQueue q = CLScheduler::get().queue(); + + if(_non_max) + { + ARM_COMPUTE_ERROR_ON_MSG(_output.cl_buffer().get() == nullptr, "Unconfigured function"); + const auto out_buffer = static_cast<unsigned char *>(q.enqueueMapBuffer(_output.cl_buffer(), CL_TRUE, CL_MAP_WRITE, 0, _output.info()->total_size())); + memset(out_buffer, 0, _output.info()->total_size()); + q.enqueueUnmapMemObject(_output.cl_buffer(), out_buffer); + } + + CLScheduler::get().enqueue(_fast_corners_kernel, false); + + if(_non_max) + { + _suppr_func.run(); + } + + CLScheduler::get().enqueue(_copy_array_kernel, false); + + unsigned int get_num_corners = 0; + q.enqueueReadBuffer(_num_buffer, CL_TRUE, 0, sizeof(unsigned int), &get_num_corners); + + size_t corner_size = std::min(static_cast<size_t>(get_num_corners), _corners->max_num_values()); + + _corners->resize(corner_size); + + if(_num_corners != nullptr) + { + *_num_corners = get_num_corners; + } + + q.flush(); +} diff --git a/src/runtime/CL/functions/CLFillBorder.cpp b/src/runtime/CL/functions/CLFillBorder.cpp new file mode 100644 index 0000000000..9e59b771d8 --- /dev/null +++ b/src/runtime/CL/functions/CLFillBorder.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLFillBorder.h" + +#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" +#include "arm_compute/core/Helpers.h" + +#include + +using namespace arm_compute; + +void CLFillBorder::configure(ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(tensor, border_width, border_mode, constant_border_value); + _kernel = std::move(k); +} diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp new file mode 100644 index 0000000000..57d57d517f --- /dev/null +++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp @@ -0,0 +1,343 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" + +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include +#include + +using namespace arm_compute; + +CLFullyConnectedLayerReshapeWeights::CLFullyConnectedLayerReshapeWeights() + : _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false) +{ +} + +void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output, bool transpose_weights, bool is_batched_fc_layer) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON(output == nullptr); + ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != 2); + ARM_COMPUTE_ERROR_ON((transpose_weights == false) && (is_batched_fc_layer == false)); + + const DataType dt = input->info()->data_type(); + const int fixed_point_position = input->info()->fixed_point_position(); + + _transpose_weights = transpose_weights; + _is_batched_fc_layer = is_batched_fc_layer; + + // Check if we need to transpose the weights + if(_transpose_weights) + { + if(_is_batched_fc_layer) + { + // Initialize the output tensor for transpose + TensorShape shape_transposed(input->info()->dimension(1), input->info()->dimension(0)); + _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, dt, fixed_point_position)); + _transpose_kernel.configure(input, &_transpose_output); + + // Configure transpose 1xW kernel + _transpose1xW_kernel.configure(&_transpose_output, output); + + // Allocate temporary tensor used for transposing the weights + _transpose_output.allocator()->allocate(); + } + else + { + _transpose_kernel.configure(input, output); + } + } + else + { + if(_is_batched_fc_layer) + { + // Configure transpose 1xW kernel + _transpose1xW_kernel.configure(input, output); + } + else + { + ARM_COMPUTE_ERROR("Configuration transpose_weights=false & is_batched_fc_layer=false not supported"); + } + } +} + +void CLFullyConnectedLayerReshapeWeights::run() +{ + if(_transpose_weights) + { + CLScheduler::get().enqueue(_transpose_kernel, _is_batched_fc_layer); + } + if(_is_batched_fc_layer) + { + CLScheduler::get().enqueue(_transpose1xW_kernel); + } +} + +CLFullyConnectedLayer::CLFullyConnectedLayer() + : _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _reshape_weights_output(), + _are_weights_reshaped(true), _is_fc_after_conv(true), _is_batched_fc_layer(false), _accumulate_biases(false) +{ +} + +void CLFullyConnectedLayer::configure_conv_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2) * (16 / weights->info()->element_size()))); + + const DataType dt = input->info()->data_type(); + const int fixed_point_position = input->info()->fixed_point_position(); + + // If the fully connected layer is called after a convolution layer, the input tensor must be linearized + + // Initialize output tensor for im2col + TensorShape shape_im2col; + shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)); + shape_im2col.set(1, input->info()->dimension(3)); + shape_im2col.set(2, input->info()->dimension(4)); + shape_im2col.set(3, input->info()->dimension(5)); + 
_im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position)); + + // Initialize output tensor for interleave 4x4 + TensorShape shape_interleaved = _im2col_output.info()->tensor_shape(); + shape_interleaved.set(0, shape_interleaved.x() * 4); + shape_interleaved.set(1, std::ceil(static_cast(shape_interleaved.y()) / 4)); + _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position)); + + // Configure im2col kernel + _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false); + + // Configure interleave4x4 kernel + _interleave4x4_kernel.configure(&_im2col_output, &_interleave4x4_output); + + // Configure matrix multiply kernel + _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f); + + // Allocate the tensors once all the configure methods have been called + _im2col_output.allocator()->allocate(); + _interleave4x4_output.allocator()->allocate(); +} + +void CLFullyConnectedLayer::configure_fc_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output) +{ + const DataType dt = input->info()->data_type(); + const int fixed_point_position = input->info()->fixed_point_position(); + + // Initialize output tensor for interleave 4x4 + TensorShape shape_interleaved = input->info()->tensor_shape(); + shape_interleaved.set(0, shape_interleaved.x() * 4); + shape_interleaved.set(1, std::ceil(static_cast(shape_interleaved.y()) / 4)); + _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position)); + + // Configure interleave4x4 kernel + _interleave4x4_kernel.configure(input, &_interleave4x4_output); + + // Configure matrix multiply kernel + _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f); + + // Allocate the tensors once all the configure methods have been called + _interleave4x4_output.allocator()->allocate(); +} + +void CLFullyConnectedLayer::configure_conv_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); + + const DataType dt = input->info()->data_type(); + const int fixed_point_position = input->info()->fixed_point_position(); + + // If the fully connected layer is called after a convolution layer, the input tensor must be linearized + + // Initialize output tensor for im2col + TensorShape shape_im2col; + shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)); + shape_im2col.set(1, 1); + _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position)); + + // Configure im2col kernel + _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false); + + // Configure matrix multiply kernel + _mm_kernel.configure(&_im2col_output, weights, output, 1.0f); + + // Allocate the output tensor for im2col once all the configure methods have been called + _im2col_output.allocator()->allocate(); +} + +void CLFullyConnectedLayer::configure_fc_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); + + // Configure matrix multiply kernel + _mm_kernel.configure(input, weights, output, 1.0f); +} + +void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor 
*output, bool transpose_weights, bool are_weights_reshaped) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output); + ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() != 2); + + const DataType dt = input->info()->data_type(); + const int fixed_point_position = input->info()->fixed_point_position(); + + _are_weights_reshaped = are_weights_reshaped; + _is_fc_after_conv = true; + _is_batched_fc_layer = false; + _accumulate_biases = false; + + if(biases != nullptr) + { + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); + + _accumulate_biases = true; + + // Configure accumulate biases kernel + _accumulate_biases_kernel.configure(output, biases); + } + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + // Check if we have a fully connected layer with batches + _is_batched_fc_layer = (output->info()->dimension(1) > 1); + + const ICLTensor *weights_to_use = weights; + + if(!are_weights_reshaped) + { + if((transpose_weights || _is_batched_fc_layer)) + { + weights_to_use = &_reshape_weights_output; + + if(transpose_weights) + { + if(_is_batched_fc_layer) + { + const float transpose_width = 16.0f / input->info()->element_size(); + TensorShape shape_wt(weights->info()->dimension(0) * static_cast(transpose_width), static_cast(std::ceil(weights->info()->dimension(1) / transpose_width))); + TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position); + _reshape_weights_output.allocator()->init(info_wt); + } + else + { + TensorShape shape_wt(weights->info()->dimension(1), weights->info()->dimension(0)); + TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position); + _reshape_weights_output.allocator()->init(info_wt); + } + } + else + { + ARM_COMPUTE_ERROR_ON(!_is_batched_fc_layer); + + const float transpose_width = 16.0f / input->info()->element_size(); + TensorShape shape_wt(weights->info()->dimension(1) * static_cast(transpose_width), static_cast(std::ceil(weights->info()->dimension(0) / transpose_width))); + TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position); + _reshape_weights_output.allocator()->init(info_wt); + } + + // Reshape the weights + _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer); + } + } + + if(_is_batched_fc_layer) + { + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3, + input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); + + if(_is_fc_after_conv) + { + // Fully Connected layer after a Convolution Layer with batches + configure_conv_fc_wb(input, weights_to_use, output); + } + else + { + // Fully Connected layer after a Fully Connected Layer with batches + configure_fc_fc_wb(input, weights_to_use, output); + } + } + else + { + // In case of not batched fully connected layer, the weights will not be reshaped using transposed1xW + _is_fc_after_conv = ((weights_to_use->info()->dimension(1)) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))); + + if(_is_fc_after_conv) + { + // Fully Connected layer after a Convolution 
Layer without batches + configure_conv_fc_nb(input, weights_to_use, output); + } + else + { + // Fully Connected layer after a Fully Connected Layer without batches + configure_fc_fc_nb(input, weights_to_use, output); + } + } + + // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called + if(!are_weights_reshaped) + { + if(transpose_weights || _is_batched_fc_layer) + { + // Allocate the tensor for the weights reshaped + _reshape_weights_output.allocator()->allocate(); + } + } +} + +void CLFullyConnectedLayer::run() +{ + // Reshape of the weights (happens only once) + if(!_are_weights_reshaped) + { + _are_weights_reshaped = true; + _reshape_weights_kernel.run(); + } + + // Linearize input if it comes from a convolutional layer + if(_is_fc_after_conv) + { + CLScheduler::get().enqueue(_im2col_kernel, false); + } + + // Interleave input + if(_is_batched_fc_layer) + { + CLScheduler::get().enqueue(_interleave4x4_kernel, false); + } + + // Run matrix multiply + CLScheduler::get().enqueue(_mm_kernel, !_accumulate_biases); + + // Accumulate biases if provided + if(_accumulate_biases) + { + CLScheduler::get().enqueue(_accumulate_biases_kernel); + } +} diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp new file mode 100644 index 0000000000..7408054127 --- /dev/null +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLGEMM.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/ITensorAllocator.h" + +using namespace arm_compute; + +CLGEMM::CLGEMM() + : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _run_vector_matrix_multiplication(false), _run_addition(false) +{ +} + +void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::F32, DataType::F16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::F16); + + if(c != nullptr) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c); + ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A"); + ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B"); + ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(0) != output->info()->dimension(0), "The C matrix must have the same number of columns as the output matrix"); + ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of rows as the output matrix"); + } + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output); + ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); + + // Check if the first input tensor is a vector.
If so, all the kernels for reshaping the tensors can be skipped + if(a->info()->dimension(1) != 1) + { + _run_vector_matrix_multiplication = false; + + TensorShape shape_tmp_a = a->info()->tensor_shape(); + TensorShape shape_tmp_b = b->info()->tensor_shape(); + + shape_tmp_a.set(0, a->info()->dimension(0) * 4); + shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f)); + + if(DataType::F32 == a->info()->data_type()) + { + shape_tmp_b.set(0, b->info()->dimension(1) * 4); + shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 4.0f)); + } + else if(DataType::F16 == a->info()->data_type()) + { + shape_tmp_b.set(0, b->info()->dimension(1) * 8); + shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 8.0f)); + } + else + { + ARM_COMPUTE_ERROR("DataType not supported"); + } + + TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type()); + _tmp_a.allocator()->init(info_a); + + TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type()); + _tmp_b.allocator()->init(info_b); + + // Configure interleave kernel + _interleave_kernel.configure(a, &_tmp_a); + + // Configure transpose kernel + _transpose_kernel.configure(b, &_tmp_b); + + // Configure matrix multiply kernel + _mm_kernel.configure(&_tmp_a, &_tmp_b, output, alpha); + + // Allocate intermediate tensors + _tmp_a.allocator()->allocate(); + _tmp_b.allocator()->allocate(); + } + else // The first input tensor is a vector + { + _run_vector_matrix_multiplication = true; + + // Configure the matrix multiply kernel + _mm_kernel.configure(a, b, output, alpha); + } + + // Configure matrix addition kernel + if(beta != 0 && c != nullptr) + { + _ma_kernel.configure(c, output, beta); + _run_addition = true; + } +} + +void CLGEMM::run() +{ + if(!_run_vector_matrix_multiplication) + { + // Run interleave kernel + CLScheduler::get().enqueue(_interleave_kernel, false); + + // Run transpose kernel + CLScheduler::get().enqueue(_transpose_kernel, false); + } + + // Run matrix multiply kernel + CLScheduler::get().enqueue(_mm_kernel, !_run_addition); + + // Run matrix addition kernel + if(_run_addition) + { + CLScheduler::get().enqueue(_ma_kernel); + } +} diff --git a/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp b/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp new file mode 100644 index 0000000000..9dc77156ef --- /dev/null +++ b/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
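To make the temporary shapes chosen in CLGEMM::configure() above concrete: interleaving packs 4 rows of A into one, and the 1xW transpose of B packs 16 / element_size values per output element, so the multiplier is 4 for F32 and 8 for F16. A small worked example of that arithmetic (standalone; shapes given as (dimension 0, dimension 1) pairs):

```cpp
#include <cmath>
#include <cstdio>
#include <utility>

using Shape2D = std::pair<unsigned int, unsigned int>; // (width, height) = (dim 0, dim 1)

// A interleaved 4x4: width grows by 4, height shrinks to ceil(height / 4).
Shape2D interleaved_shape(Shape2D a)
{
    return { a.first * 4, static_cast<unsigned int>(std::ceil(a.second / 4.0f)) };
}

// B transposed 1xW: mult = 16 / element_size values are packed per output row.
Shape2D transposed_1xw_shape(Shape2D b, unsigned int element_size)
{
    const unsigned int mult = 16 / element_size; // 4 for F32, 8 for F16
    return { b.second * mult, static_cast<unsigned int>(std::ceil(b.first / static_cast<float>(mult))) };
}

int main()
{
    const Shape2D a{ 64, 32 }; // 64 columns (K), 32 rows (M)
    const Shape2D b{ 96, 64 }; // 96 columns (N), 64 rows (K)

    const Shape2D ta = interleaved_shape(a);
    const Shape2D tb = transposed_1xw_shape(b, sizeof(float));

    std::printf("tmp_a = %u x %u, tmp_b = %u x %u\n", ta.first, ta.second, tb.first, tb.second);
    // Prints: tmp_a = 256 x 8, tmp_b = 256 x 24
    return 0;
}
```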
+ */ +#include "arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h" + +#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h" +#include "arm_compute/core/Helpers.h" + +using namespace arm_compute; + +void CLGEMMInterleave4x4::configure(const ICLTensor *input, ICLTensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); +} diff --git a/src/runtime/CL/functions/CLGEMMLowp.cpp b/src/runtime/CL/functions/CLGEMMLowp.cpp new file mode 100644 index 0000000000..45e011d8ce --- /dev/null +++ b/src/runtime/CL/functions/CLGEMMLowp.cpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLGEMMLowp.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +using namespace arm_compute; + +CLGEMMLowp::CLGEMMLowp() + : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b() +{ +} + +void CLGEMMLowp::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output); + ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); + ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != output->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A"); + ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != output->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B"); + + // Create shape for interleaved temporary tensor + TensorShape shape_tmp_a = a->info()->tensor_shape(); + shape_tmp_a.set(0, a->info()->dimension(0) * 4); + shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f)); + TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type()); + _tmp_a.allocator()->init(info_a); + + // Create shape for transposed temporary tensor + TensorShape shape_tmp_b = b->info()->tensor_shape(); + shape_tmp_b.set(0, b->info()->dimension(1) * 16); + shape_tmp_b.set(1, std::ceil(static_cast<float>(b->info()->dimension(0)) / 16)); + TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type()); + _tmp_b.allocator()->init(info_b); + + // Configure kernels + _interleave_kernel.configure(a, &_tmp_a); + _transpose_kernel.configure(b, &_tmp_b); + _mm_kernel.configure(&_tmp_a, &_tmp_b, output, a_offset, b_offset, output_offset, output_mult_int, shift); + + // Allocate intermediate buffers + _tmp_a.allocator()->allocate(); + _tmp_b.allocator()->allocate(); +} + +void CLGEMMLowp::run() +{ + /* Run interleave kernel */ + CLScheduler::get().enqueue(_interleave_kernel, false); + + /* Run transpose kernel */ + CLScheduler::get().enqueue(_transpose_kernel, false); + + /* Run matrix multiply kernel */ + CLScheduler::get().enqueue(_mm_kernel, false); +} diff --git a/src/runtime/CL/functions/CLGaussian3x3.cpp b/src/runtime/CL/functions/CLGaussian3x3.cpp new file mode 100644 index 0000000000..362a3fe920 --- /dev/null +++ b/src/runtime/CL/functions/CLGaussian3x3.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited.
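The a_offset, b_offset, output_offset, output_mult_int and shift arguments passed to the CLGEMMLowp matrix multiply kernel above follow the common low-precision GEMM scheme: both U8 operands are offset before the multiply-accumulate, and the 32-bit accumulator is offset, scaled and shifted back down to U8. The scalar reference below spells out that assumed semantics; it illustrates the quantization arithmetic rather than copying the kernel:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// One output element of a low-precision GEMM (assumed semantics):
//   acc = sum_k (A[m][k] + a_offset) * (B[k][n] + b_offset)
//   out = clamp(((acc + output_offset) * output_mult_int) >> shift, 0, 255)
uint8_t gemmlowp_ref(const std::vector<uint8_t> &a_row, const std::vector<uint8_t> &b_col,
                     int32_t a_offset, int32_t b_offset,
                     int32_t output_offset, int32_t output_mult_int, int32_t shift)
{
    int32_t acc = 0;
    for(size_t k = 0; k < a_row.size(); ++k)
    {
        acc += (static_cast<int32_t>(a_row[k]) + a_offset) * (static_cast<int32_t>(b_col[k]) + b_offset);
    }
    const int32_t requantized = ((acc + output_offset) * output_mult_int) >> shift;
    return static_cast<uint8_t>(std::min(255, std::max(0, requantized)));
}
```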
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLGaussian3x3.h" + +#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/PixelValue.h" + +#include + +using namespace arm_compute; + +void CLGaussian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, border_mode == BorderMode::UNDEFINED); + _kernel = std::move(k); + _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); +} diff --git a/src/runtime/CL/functions/CLGaussian5x5.cpp b/src/runtime/CL/functions/CLGaussian5x5.cpp new file mode 100644 index 0000000000..e83a8fb857 --- /dev/null +++ b/src/runtime/CL/functions/CLGaussian5x5.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/ITensorAllocator.h" + +#include + +using namespace arm_compute; + +CLGaussian5x5::CLGaussian5x5() + : _kernel_hor(), _kernel_vert(), _border_handler(), _tmp() +{ +} + +void CLGaussian5x5::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + + _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, DataType::U16)); + + _kernel_hor.configure(input, &_tmp, border_mode == BorderMode::UNDEFINED); + _kernel_vert.configure(&_tmp, output, border_mode == BorderMode::UNDEFINED); + _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value)); + + // Allocate intermediate buffers + _tmp.allocator()->allocate(); +} + +void CLGaussian5x5::run() +{ + CLScheduler::get().enqueue(_border_handler, false); + CLScheduler::get().enqueue(_kernel_hor, false); + CLScheduler::get().enqueue(_kernel_vert); +} diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp new file mode 100644 index 0000000000..8a4279e99b --- /dev/null +++ b/src/runtime/CL/functions/CLGaussianPyramid.cpp @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
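CLGaussian5x5 above can run as a horizontal pass into a U16 temporary followed by a vertical pass because the 5x5 Gaussian kernel is separable: assuming the usual binomial coefficients, it is the outer product of [1 4 6 4 1] with itself scaled by 1/256, and a horizontally filtered U8 row can reach at most 255 * 16 = 4080, which is why U16 is enough for _tmp. A quick check of the outer-product identity:

```cpp
#include <cstdio>

int main()
{
    const int g[5] = { 1, 4, 6, 4, 1 }; // 1D binomial kernel, coefficients sum to 16

    // The outer product gives the 5x5 Gaussian kernel; the overall scale is 1 / (16 * 16) = 1 / 256.
    for(int y = 0; y < 5; ++y)
    {
        for(int x = 0; x < 5; ++x)
        {
            std::printf("%3d ", g[y] * g[x]);
        }
        std::printf("\n");
    }
    return 0;
}
```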
+ */ +#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h" +#include "arm_compute/core/CL/kernels/CLScaleKernel.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "arm_compute/runtime/CL/CLPyramid.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/CLTensorAllocator.h" +#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" + +#include + +using namespace arm_compute; + +CLGaussianPyramid::CLGaussianPyramid() + : _input(nullptr), _pyramid(nullptr), _tmp() +{ +} + +CLGaussianPyramidHalf::CLGaussianPyramidHalf() + : _border_handler(), _horizontal_reduction(), _vertical_reduction() +{ +} + +void CLGaussianPyramidHalf::configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(pyramid == nullptr); + ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width()); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height()); + ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_HALF != pyramid->info()->scale()); + + /* Get number of pyramid levels */ + const size_t num_levels = pyramid->info()->num_levels(); + + _input = input; + _pyramid = pyramid; + + if(num_levels > 1) + { + _border_handler = arm_compute::cpp14::make_unique(num_levels - 1); + _horizontal_reduction = arm_compute::cpp14::make_unique(num_levels - 1); + _vertical_reduction = arm_compute::cpp14::make_unique(num_levels - 1); + + // Apply half scale to the X dimension of the tensor shape + TensorShape tensor_shape = pyramid->info()->tensor_shape(); + tensor_shape.set(0, (pyramid->info()->width() + 1) * SCALE_PYRAMID_HALF); + + PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_HALF, tensor_shape, Format::U16); + + _tmp.init(pyramid_info); + + for(size_t i = 0; i < num_levels - 1; ++i) + { + /* Configure horizontal kernel */ + _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode == BorderMode::UNDEFINED); + + /* Configure vertical kernel */ + _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), border_mode == BorderMode::UNDEFINED); + + /* Configure border */ + _border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value)); + } + _tmp.allocate(); + } +} + +void CLGaussianPyramidHalf::run() +{ + ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function"); + + /* Get number of pyramid levels */ + const size_t num_levels = _pyramid->info()->num_levels(); + + /* The first level of the pyramid has the input image */ + _pyramid->get_pyramid_level(0)->map(CLScheduler::get().queue(), true /* blocking */); + _input->map(CLScheduler::get().queue(), true /* blocking */); + _pyramid->get_pyramid_level(0)->copy_from(*_input); + _input->unmap(CLScheduler::get().queue()); + _pyramid->get_pyramid_level(0)->unmap(CLScheduler::get().queue()); + + for(unsigned int i = 0; i < num_levels 
- 1; ++i) + { + CLScheduler::get().enqueue(_border_handler[i], false); + CLScheduler::get().enqueue(_horizontal_reduction[i], false); + CLScheduler::get().enqueue(_vertical_reduction[i], false); + } +} + +CLGaussianPyramidOrb::CLGaussianPyramidOrb() + : _gauss5x5(), _scale_nearest() +{ +} + +void CLGaussianPyramidOrb::configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(nullptr == pyramid); + ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width()); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height()); + ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_ORB != pyramid->info()->scale()); + + /* Get number of pyramid levels */ + const size_t num_levels = pyramid->info()->num_levels(); + + _input = input; + _pyramid = pyramid; + + if(num_levels > 1) + { + _gauss5x5 = arm_compute::cpp14::make_unique(num_levels - 1); + _scale_nearest = arm_compute::cpp14::make_unique(num_levels - 1); + + PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8); + + _tmp.init(pyramid_info); + + for(size_t i = 0; i < num_levels - 1; ++i) + { + /* Configure gaussian 5x5 */ + _gauss5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value); + + /* Configure scale image kernel */ + _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, border_mode == BorderMode::UNDEFINED); + } + + _tmp.allocate(); + } +} + +void CLGaussianPyramidOrb::run() +{ + ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function"); + + /* Get number of pyramid levels */ + const size_t num_levels = _pyramid->info()->num_levels(); + + /* The first level of the pyramid has the input image */ + _pyramid->get_pyramid_level(0)->map(CLScheduler::get().queue(), true /* blocking */); + _input->map(CLScheduler::get().queue(), true /* blocking */); + _pyramid->get_pyramid_level(0)->copy_from(*_input); + _input->unmap(CLScheduler::get().queue()); + _pyramid->get_pyramid_level(0)->unmap(CLScheduler::get().queue()); + + for(unsigned int i = 0; i < num_levels - 1; ++i) + { + _gauss5x5[i].run(); + CLScheduler::get().enqueue(_scale_nearest[i]); + } +} diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp new file mode 100644 index 0000000000..b1b5a03ac1 --- /dev/null +++ b/src/runtime/CL/functions/CLHOGDescriptor.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
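Both pyramid variants above produce level i with dimensions scaled by scale^i relative to the input, with SCALE_PYRAMID_HALF = 0.5 and SCALE_PYRAMID_ORB = 2^(-1/4), about 0.8409. A small sketch of the resulting level geometry; the rounding used here is illustrative and does not claim to reproduce PyramidInfo exactly:

```cpp
#include <cmath>
#include <cstdio>

int main()
{
    const float scale = 0.5f; // SCALE_PYRAMID_HALF; an ORB pyramid would use ~0.8408964f
    const unsigned int num_levels = 4;

    float w = 640.0f;
    float h = 480.0f;

    for(unsigned int i = 0; i < num_levels; ++i)
    {
        std::printf("level %u: %u x %u\n", i, static_cast<unsigned int>(w), static_cast<unsigned int>(h));
        w = std::ceil(w * scale); // the next level is scaled down by 'scale'
        h = std::ceil(h * scale);
    }
    return 0;
}
```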
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLHOGDescriptor.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/HOGInfo.h" +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +using namespace arm_compute; + +CLHOGDescriptor::CLHOGDescriptor() + : _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space() +{ +} + +void CLHOGDescriptor::configure(ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(nullptr == output); + ARM_COMPUTE_ERROR_ON(nullptr == hog); + + const HOGInfo *hog_info = hog->info(); + const size_t width = input->info()->dimension(Window::DimX); + const size_t height = input->info()->dimension(Window::DimY); + const size_t num_bins = hog_info->num_bins(); + + Size2D cell_size = hog_info->cell_size(); + + // Calculate number of cells along the x and y directions for the hog_space + const size_t num_cells_x = width / cell_size.width; + const size_t num_cells_y = height / cell_size.height; + + // TensorShape of the input image + const TensorShape &shape_img = input->info()->tensor_shape(); + + // TensorShape of the hog space + TensorShape shape_hog_space = input->info()->tensor_shape(); + shape_hog_space.set(Window::DimX, num_cells_x); + shape_hog_space.set(Window::DimY, num_cells_y); + + // Intitialize tensors for magnitude, phase and hog space + TensorInfo info_mag(shape_img, Format::S16); + _mag.allocator()->init(info_mag); + + TensorInfo info_phase(shape_img, Format::U8); + _phase.allocator()->init(info_phase); + + TensorInfo info_space(shape_hog_space, num_bins, DataType::F32); + _hog_space.allocator()->init(info_space); + + // Initialise gradient kernel + _gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value); + + // Initialise orientation binning kernel + _orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info()); + + // Initialize HOG norm kernel + _block_norm.configure(&_hog_space, output, hog->info()); + + // Allocate intermediate tensors + _mag.allocator()->allocate(); + _phase.allocator()->allocate(); + _hog_space.allocator()->allocate(); +} + +void CLHOGDescriptor::run() +{ + // Run gradient + _gradient.run(); + + // Run orientation binning + CLScheduler::get().enqueue(_orient_bin, false); + + // Run block normalization + CLScheduler::get().enqueue(_block_norm); +} \ No newline at end of file diff --git a/src/runtime/CL/functions/CLHOGDetector.cpp b/src/runtime/CL/functions/CLHOGDetector.cpp new file mode 100644 index 0000000000..8eb5e4251f --- /dev/null +++ b/src/runtime/CL/functions/CLHOGDetector.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2017 ARM Limited. 
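To put numbers on the shapes set up in CLHOGDescriptor::configure() above, take the classic 64x128 detection window with 8x8 cells, 16x16 blocks, an 8x8 block stride and 9 bins (example geometry, not values taken from the library): the HOG space is 8x16 cells with 9 bins each, and the normalized descriptor ends up with 7 * 15 blocks of 4 cells, i.e. 3780 values. The bookkeeping, as standard HOG arithmetic:

```cpp
#include <cstdio>

int main()
{
    // Example geometry (assumed, Dalal-Triggs style); all sizes in pixels
    const unsigned int win_w = 64, win_h = 128;
    const unsigned int cell         = 8;
    const unsigned int block        = 16;
    const unsigned int block_stride = 8;
    const unsigned int num_bins     = 9;

    const unsigned int cells_x  = win_w / cell;                           // 8
    const unsigned int cells_y  = win_h / cell;                           // 16
    const unsigned int blocks_x = (win_w - block) / block_stride + 1;     // 7
    const unsigned int blocks_y = (win_h - block) / block_stride + 1;     // 15
    const unsigned int cells_per_block = (block / cell) * (block / cell); // 4

    std::printf("HOG space: %u x %u cells, %u bins per cell\n", cells_x, cells_y, num_bins);
    std::printf("descriptor: %u blocks x %u values = %u values\n",
                blocks_x * blocks_y, cells_per_block * num_bins,
                blocks_x * blocks_y * cells_per_block * num_bins); // 105 x 36 = 3780
    return 0;
}
```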
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLHOGDetector.h" + +#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include + +using namespace arm_compute; + +CLHOGDetector::CLHOGDetector() + : _hog_detector_kernel(), _detection_windows(nullptr), _num_detection_windows() +{ +} + +void CLHOGDetector::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class) +{ + _detection_windows = detection_windows; + + // Allocate buffer for storing the number of detected objects + _num_detection_windows = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int)); + + // Configure HOGDetectorKernel + _hog_detector_kernel.configure(input, hog, detection_windows, &_num_detection_windows, detection_window_stride, threshold, idx_class); +} + +void CLHOGDetector::run() +{ + cl::CommandQueue q = CLScheduler::get().queue(); + + // Reset number of detections + const unsigned int init_num_detection_windows = _detection_windows->num_values(); + q.enqueueWriteBuffer(_num_detection_windows, CL_FALSE, 0, sizeof(unsigned int), &init_num_detection_windows); + + // Run CLHOGDetectorKernel + CLScheduler::get().enqueue(_hog_detector_kernel); + + // Read number of detections + unsigned int num_detection_windows = 0; + q.enqueueReadBuffer(_num_detection_windows, CL_TRUE, 0, sizeof(unsigned int), &num_detection_windows); + + // Update the number of values stored in _detection_windows + _detection_windows->resize(static_cast(num_detection_windows)); + + q.flush(); +} \ No newline at end of file diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp new file mode 100644 index 0000000000..2387474358 --- /dev/null +++ b/src/runtime/CL/functions/CLHOGGradient.cpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2017 ARM Limited. 
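CLHOGDetector::run() above, like CLFastCorners::run() earlier, relies on the same host-side pattern: a one-uint cl::Buffer acts as a counter that the kernel bumps atomically, and the host seeds it, enqueues the kernel, reads the final value back and resizes the output array to match. A stripped-down sketch of that pattern using the stock OpenCL C++ wrapper; the header choice and the enqueue_kernel callback are assumptions, since the library routes everything through CLScheduler and its own arm_compute/core/CL/OpenCL.h wrapper:

```cpp
#include <CL/cl2.hpp>

#include <algorithm>
#include <cstddef>
#include <functional>
#include <vector>

// Runs a kernel that appends results and atomically increments a device-side counter,
// then returns a host-side vector sized to the number of results actually produced.
std::vector<int> run_with_counter(cl::Context &context, cl::CommandQueue &queue,
                                  const std::function<void(cl::Buffer &)> &enqueue_kernel,
                                  unsigned int initial_count, size_t max_results)
{
    cl::Buffer counter(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int));

    // Seed the counter (CLHOGDetector seeds it with the windows already stored in the array)
    queue.enqueueWriteBuffer(counter, CL_FALSE, 0, sizeof(unsigned int), &initial_count);

    // The kernel appends results and bumps the counter
    enqueue_kernel(counter);

    // Blocking read of the final count, then clamp to the destination capacity
    unsigned int count = 0;
    queue.enqueueReadBuffer(counter, CL_TRUE, 0, sizeof(unsigned int), &count);

    std::vector<int> results(std::min<size_t>(count, max_results));
    queue.flush();
    return results;
}
```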
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLHOGGradient.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +using namespace arm_compute; + +CLHOGGradient::CLHOGGradient() + : _derivative(), _mag_phase(), _gx(), _gy() +{ +} + +void CLHOGGradient::configure(ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_magnitude, 1, DataType::S16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_phase, 1, DataType::U8); + + const TensorShape &shape_img = input->info()->tensor_shape(); + + // Allocate image memory + TensorInfo info(shape_img, Format::S16); + _gx.allocator()->init(info); + _gy.allocator()->init(info); + + // Initialise derivate kernel + _derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value); + + // Initialise magnitude/phase kernel + if(PhaseType::UNSIGNED == phase_type) + { + _mag_phase.configure(&_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::UNSIGNED); + } + else + { + _mag_phase.configure(&_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::SIGNED); + } + + // Allocate intermediate tensors + _gx.allocator()->allocate(); + _gy.allocator()->allocate(); +} + +void CLHOGGradient::run() +{ + // Run derivative + _derivative.run(); + + // Run magnitude/phase kernel + CLScheduler::get().enqueue(_mag_phase); +} \ No newline at end of file diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp new file mode 100644 index 0000000000..b8f2224ac8 --- /dev/null +++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp @@ -0,0 +1,240 @@ +/* + * Copyright (c) 2017 ARM Limited. 
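The magnitude/phase kernel configured above computes, per pixel, an L2 magnitude sqrt(gx^2 + gy^2) and a gradient orientation that is wrapped to [0, 180) degrees for PhaseType::UNSIGNED or left over [0, 360) for SIGNED. A scalar sketch of that math; how the kernel quantizes the angle into its U8 output is not reproduced here:

```cpp
#include <cmath>
#include <cstdint>

struct MagPhase
{
    int16_t magnitude; // same S16 range as the kernel output
    float   phase_deg; // the kernel stores a U8-quantized version of this angle
};

MagPhase mag_phase(int16_t gx, int16_t gy, bool unsigned_phase)
{
    MagPhase out{};

    // L2 norm of the gradient
    out.magnitude = static_cast<int16_t>(std::lround(std::sqrt(static_cast<float>(gx) * gx + static_cast<float>(gy) * gy)));

    // Orientation in degrees, mapped into [0, 360)
    float angle = std::atan2(static_cast<float>(gy), static_cast<float>(gx)) * 180.0f / 3.14159265f;
    if(angle < 0.0f)
    {
        angle += 360.0f;
    }
    // UNSIGNED phase folds a direction and its opposite into the same bin
    if(unsigned_phase && angle >= 180.0f)
    {
        angle -= 180.0f;
    }
    out.phase_deg = angle;
    return out;
}
```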
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLHOGMultiDetection.h" + +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/CL/CLArray.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/CLTensor.h" + +using namespace arm_compute; + +CLHOGMultiDetection::CLHOGMultiDetection() + : _gradient_kernel(), _orient_bin_kernel(), _block_norm_kernel(), _hog_detect_kernel(), _non_maxima_kernel(), _hog_space(), _hog_norm_space(), _detection_windows(), _mag(), _phase(), + _non_maxima_suppression(false), _num_orient_bin_kernel(0), _num_block_norm_kernel(0), _num_hog_detect_kernel(0) +{ +} + +void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides, BorderMode border_mode, + uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(multi_hog); + ARM_COMPUTE_ERROR_ON(nullptr == detection_windows); + ARM_COMPUTE_ERROR_ON(detection_window_strides->num_values() != multi_hog->num_models()); + + const size_t width = input->info()->dimension(Window::DimX); + const size_t height = input->info()->dimension(Window::DimY); + const TensorShape &shape_img = input->info()->tensor_shape(); + const size_t num_models = multi_hog->num_models(); + PhaseType phase_type = multi_hog->model(0)->info()->phase_type(); + + size_t prev_num_bins = multi_hog->model(0)->info()->num_bins(); + Size2D prev_cell_size = multi_hog->model(0)->info()->cell_size(); + Size2D prev_block_size = multi_hog->model(0)->info()->block_size(); + Size2D prev_block_stride = multi_hog->model(0)->info()->block_stride(); + + /* Check if CLHOGOrientationBinningKernel and CLHOGBlockNormalizationKernel kernels can be skipped for a specific HOG data-object + * + * 1) CLHOGOrientationBinningKernel and CLHOGBlockNormalizationKernel are skipped if the cell size and the number of bins don't change. + * Since "multi_hog" is sorted,it is enough to check the HOG descriptors at level "ith" and level "(i-1)th + * 2) CLHOGBlockNormalizationKernel is skipped if the cell size, the number of bins and block size do not change. 
+ * Since "multi_hog" is sorted, it is enough to check the HOG descriptors at level "ith" and level "(i-1)th" + * + * @note Since the orientation binning and block normalization kernels can be skipped, we need to keep track of the input to process for each kernel + * with "input_orient_bin", "input_hog_detect" and "input_block_norm" + */ + std::vector<size_t> input_orient_bin; + std::vector<size_t> input_hog_detect; + std::vector<std::pair<size_t, size_t>> input_block_norm; + + input_orient_bin.push_back(0); + input_hog_detect.push_back(0); + input_block_norm.emplace_back(0, 0); + + for(size_t i = 1; i < num_models; ++i) + { + size_t cur_num_bins = multi_hog->model(i)->info()->num_bins(); + Size2D cur_cell_size = multi_hog->model(i)->info()->cell_size(); + Size2D cur_block_size = multi_hog->model(i)->info()->block_size(); + Size2D cur_block_stride = multi_hog->model(i)->info()->block_stride(); + + if((cur_num_bins != prev_num_bins) || (cur_cell_size.width != prev_cell_size.width) || (cur_cell_size.height != prev_cell_size.height)) + { + prev_num_bins = cur_num_bins; + prev_cell_size = cur_cell_size; + prev_block_size = cur_block_size; + prev_block_stride = cur_block_stride; + + // Compute orientation binning and block normalization kernels. Update input to process + input_orient_bin.push_back(i); + input_block_norm.emplace_back(i, input_orient_bin.size() - 1); + } + else if((cur_block_size.width != prev_block_size.width) || (cur_block_size.height != prev_block_size.height) || (cur_block_stride.width != prev_block_stride.width) + || (cur_block_stride.height != prev_block_stride.height)) + { + prev_block_size = cur_block_size; + prev_block_stride = cur_block_stride; + + // Compute block normalization kernel. Update input to process + input_block_norm.emplace_back(i, input_orient_bin.size() - 1); + } + + // Update input to process for hog detector kernel + input_hog_detect.push_back(input_block_norm.size() - 1); + } + + _detection_windows = detection_windows; + _non_maxima_suppression = non_maxima_suppression; + _num_orient_bin_kernel = input_orient_bin.size(); // Number of CLHOGOrientationBinningKernel kernels to compute + _num_block_norm_kernel = input_block_norm.size(); // Number of CLHOGBlockNormalizationKernel kernels to compute + _num_hog_detect_kernel = input_hog_detect.size(); // Number of CLHOGDetector functions to compute + + _orient_bin_kernel = arm_compute::cpp14::make_unique<CLHOGOrientationBinningKernel[]>(_num_orient_bin_kernel); + _block_norm_kernel = arm_compute::cpp14::make_unique<CLHOGBlockNormalizationKernel[]>(_num_block_norm_kernel); + _hog_detect_kernel = arm_compute::cpp14::make_unique<CLHOGDetector[]>(_num_hog_detect_kernel); + _non_maxima_kernel = arm_compute::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>(); + _hog_space = arm_compute::cpp14::make_unique<CLTensor[]>(_num_orient_bin_kernel); + _hog_norm_space = arm_compute::cpp14::make_unique<CLTensor[]>(_num_block_norm_kernel); + + // Allocate tensors for magnitude and phase + TensorInfo info_mag(shape_img, Format::S16); + _mag.allocator()->init(info_mag); + + TensorInfo info_phase(shape_img, Format::U8); + _phase.allocator()->init(info_phase); + + // Initialise gradient kernel + _gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value); + + // Configure CLTensor for the HOG space and orientation binning kernel + for(size_t i = 0; i < _num_orient_bin_kernel; ++i) + { + const size_t idx_multi_hog = input_orient_bin[i]; + + // Get the corresponding cell size and number of bins + const Size2D &cell = multi_hog->model(idx_multi_hog)->info()->cell_size(); + const size_t num_bins = multi_hog->model(idx_multi_hog)->info()->num_bins(); + + // Calculate number
of cells along the x and y directions for the hog_space + const size_t num_cells_x = width / cell.width; + const size_t num_cells_y = height / cell.height; + + // TensorShape of hog space + TensorShape shape_hog_space = input->info()->tensor_shape(); + shape_hog_space.set(Window::DimX, num_cells_x); + shape_hog_space.set(Window::DimY, num_cells_y); + + // Allocate HOG space + TensorInfo info_space(shape_hog_space, num_bins, DataType::F32); + _hog_space[i].allocator()->init(info_space); + + // Initialise orientation binning kernel + _orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info()); + } + + // Configure CLTensor for the normalized HOG space and block normalization kernel + for(size_t i = 0; i < _num_block_norm_kernel; ++i) + { + const size_t idx_multi_hog = input_block_norm[i].first; + const size_t idx_orient_bin = input_block_norm[i].second; + + // Allocate normalized HOG space + TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height); + _hog_norm_space[i].allocator()->init(tensor_info); + + // Initialize block normalization kernel + _block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info()); + } + + detection_window_strides->map(CLScheduler::get().queue(), true); + + // Configure HOG detector kernel + for(size_t i = 0; i < _num_hog_detect_kernel; ++i) + { + const size_t idx_block_norm = input_hog_detect[i]; + + _hog_detect_kernel[i].configure(_hog_norm_space.get() + idx_block_norm, multi_hog->cl_model(i), detection_windows, detection_window_strides->at(i), threshold, i); + } + + detection_window_strides->unmap(CLScheduler::get().queue()); + + // Configure non maxima suppression kernel + _non_maxima_kernel->configure(_detection_windows, min_distance); + + // Allocate intermediate tensors + _mag.allocator()->allocate(); + _phase.allocator()->allocate(); + + for(size_t i = 0; i < _num_orient_bin_kernel; ++i) + { + _hog_space[i].allocator()->allocate(); + } + + for(size_t i = 0; i < _num_block_norm_kernel; ++i) + { + _hog_norm_space[i].allocator()->allocate(); + } +} + +void CLHOGMultiDetection::run() +{ + ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function"); + + // Reset detection window + _detection_windows->clear(); + + // Run gradient + _gradient_kernel.run(); + + // Run orientation binning kernel + for(size_t i = 0; i < _num_orient_bin_kernel; ++i) + { + CLScheduler::get().enqueue(*(_orient_bin_kernel.get() + i), false); + } + + // Run block normalization kernel + for(size_t i = 0; i < _num_block_norm_kernel; ++i) + { + CLScheduler::get().enqueue(*(_block_norm_kernel.get() + i), false); + } + + // Run HOG detector kernel + for(size_t i = 0; i < _num_hog_detect_kernel; ++i) + { + _hog_detect_kernel[i].run(); + } + + // Run non-maxima suppression kernel if enabled + if(_non_maxima_suppression) + { + // Map detection windows array before computing non maxima suppression + _detection_windows->map(CLScheduler::get().queue(), true); + _non_maxima_kernel->run(_non_maxima_kernel->window()); + _detection_windows->unmap(CLScheduler::get().queue()); + } +} \ No newline at end of file diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp new file mode 100644 index 0000000000..2db277fa4d --- /dev/null +++ b/src/runtime/CL/functions/CLHarrisCorners.cpp @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLHarrisCorners.h" + +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" +#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/functions/CLSobel3x3.h" +#include "arm_compute/runtime/CL/functions/CLSobel5x5.h" +#include "arm_compute/runtime/CL/functions/CLSobel7x7.h" +#include "arm_compute/runtime/ITensorAllocator.h" +#include "arm_compute/runtime/Scheduler.h" + +#include +#include + +using namespace arm_compute; + +CLHarrisCorners::CLHarrisCorners() + : _sobel(), _harris_score(), _non_max_suppr(), _candidates(), _sort_euclidean(), _border_gx(), _border_gy(), _gx(), _gy(), _score(), _nonmax(), _corners_list(), _num_corner_candidates(0), + _corners(nullptr) +{ +} + +void CLHarrisCorners::configure(ICLImage *input, float threshold, float min_dist, + float sensitivity, int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners, + BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7)); + ARM_COMPUTE_ERROR_ON(nullptr == corners); + + _corners = corners; + + const TensorShape shape = input->info()->tensor_shape(); + const DataType dt = (gradient_size < 7) ? 
DataType::S16 : DataType::S32; + TensorInfo tensor_info(shape, 1, dt); + _gx.allocator()->init(tensor_info); + _gy.allocator()->init(tensor_info); + + TensorInfo info_f32(shape, 1, DataType::F32); + _score.allocator()->init(info_f32); + _nonmax.allocator()->init(info_f32); + + _corners_list = arm_compute::cpp14::make_unique(shape.x() * shape.y()); + + /* Set/init Sobel kernel accordingly with gradient_size */ + switch(gradient_size) + { + case 3: + { + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, &_gx, &_gy, border_mode, constant_border_value); + _sobel = std::move(k); + break; + } + case 5: + { + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, &_gx, &_gy, border_mode, constant_border_value); + _sobel = std::move(k); + break; + } + case 7: + { + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, &_gx, &_gy, border_mode, constant_border_value); + _sobel = std::move(k); + break; + } + default: + ARM_COMPUTE_ERROR("Gradient size not implemented"); + } + + // Configure border filling before harris score + _border_gx.configure(&_gx, block_size / 2, border_mode, constant_border_value); + _border_gy.configure(&_gy, block_size / 2, border_mode, constant_border_value); + + // Normalization factor + const float norm_factor = 1.0f / (255.0f * pow(4.0f, gradient_size / 2) * block_size); + const float pow4_normalization_factor = pow(norm_factor, 4); + + // Set/init Harris Score kernel accordingly with block_size + _harris_score.configure(&_gx, &_gy, &_score, block_size, pow4_normalization_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED); + + // Init non-maxima suppression function + _non_max_suppr.configure(&_score, &_nonmax, border_mode == BorderMode::UNDEFINED); + + // Init corner candidates kernel + _candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates); + + // Init euclidean distance + _sort_euclidean.configure(_corners_list.get(), _corners, &_num_corner_candidates, min_dist); + + // Allocate intermediate buffers + _gx.allocator()->allocate(); + _gy.allocator()->allocate(); + _score.allocator()->allocate(); + _nonmax.allocator()->allocate(); +} + +void CLHarrisCorners::run() +{ + ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function"); + + // Init to 0 number of corner candidates + _num_corner_candidates = 0; + + // Run Sobel kernel + _sobel->run(); + + // Fill border before harris score kernel + CLScheduler::get().enqueue(_border_gx, false); + CLScheduler::get().enqueue(_border_gy, false); + + // Run harris score kernel + CLScheduler::get().enqueue(_harris_score, false); + + // Run non-maxima suppression + CLScheduler::get().enqueue(_non_max_suppr); + + // Run corner candidate kernel + _nonmax.map(true); + Scheduler::get().schedule(&_candidates, Window::DimY); + _nonmax.unmap(); + + _corners->map(CLScheduler::get().queue(), true); + _sort_euclidean.run(_sort_euclidean.window()); + _corners->unmap(CLScheduler::get().queue()); +} diff --git a/src/runtime/CL/functions/CLHistogram.cpp b/src/runtime/CL/functions/CLHistogram.cpp new file mode 100644 index 0000000000..eb543387f5 --- /dev/null +++ b/src/runtime/CL/functions/CLHistogram.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLHistogram.h" + +#include "arm_compute/runtime/CL/CLScheduler.h" + +using namespace arm_compute; + +CLHistogram::CLHistogram() + : _kernel(), _kernel_border() +{ +} + +void CLHistogram::configure(const ICLImage *input, ICLDistribution1D *output) +{ + _kernel.configure(input, output); + _kernel_border.configure(input, output); +} + +void CLHistogram::run() +{ + CLScheduler::get().enqueue(_kernel, false); + CLScheduler::get().enqueue(_kernel_border); +} diff --git a/src/runtime/CL/functions/CLIntegralImage.cpp b/src/runtime/CL/functions/CLIntegralImage.cpp new file mode 100644 index 0000000000..2d54be32fa --- /dev/null +++ b/src/runtime/CL/functions/CLIntegralImage.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLIntegralImage.h" + +#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +using namespace arm_compute; + +CLIntegralImage::CLIntegralImage() + : _integral_hor(), _integral_vert() +{ +} + +void CLIntegralImage::configure(const ICLTensor *input, ICLTensor *output) +{ + _integral_hor.configure(input, output); + _integral_vert.configure(output); +} + +void CLIntegralImage::run() +{ + CLScheduler::get().enqueue(_integral_hor, false); + CLScheduler::get().enqueue(_integral_vert); +} diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp new file mode 100644 index 0000000000..d7ce20653d --- /dev/null +++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IPyramid.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h" +#include "arm_compute/runtime/CL/functions/CLDepthConvert.h" +#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" +#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h" + +using namespace arm_compute; + +CLLaplacianPyramid::CLLaplacianPyramid() + : _num_levels(0), _gaussian_pyr_function(), _convf(), _subf(), _depth_function(), _gauss_pyr(), _conv_pyr() +{ +} + +void CLLaplacianPyramid::configure(ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON(nullptr == pyramid); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16); + ARM_COMPUTE_ERROR_ON(0 == pyramid->info()->num_levels()); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width()); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height()); + ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0)); + ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1)); + + _num_levels = pyramid->info()->num_levels(); + + // Create and initialize the gaussian pyramid and the convoluted pyramid + PyramidInfo pyramid_info; + pyramid_info.init(_num_levels, 0.5f, pyramid->info()->tensor_shape(), arm_compute::Format::U8); + + _gauss_pyr.init(pyramid_info); + _conv_pyr.init(pyramid_info); + + // Create Gaussian Pyramid function + _gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value); + + _convf = arm_compute::cpp14::make_unique(_num_levels); + _subf = arm_compute::cpp14::make_unique(_num_levels); + + for(unsigned int i = 0; i < _num_levels; ++i) + { + _convf[i].configure(_gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), border_mode, constant_border_value); + _subf[i].configure(_gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), pyramid->get_pyramid_level(i), ConvertPolicy::WRAP); + } + + _depth_function.configure(_conv_pyr.get_pyramid_level(_num_levels - 1), output, ConvertPolicy::WRAP, 0); + + _gauss_pyr.allocate(); + _conv_pyr.allocate(); +} + +void CLLaplacianPyramid::run() +{ + ARM_COMPUTE_ERROR_ON_MSG(0 == _num_levels, "Unconfigured function"); + + _gaussian_pyr_function.run(); // compute gaussian pyramid + + for(unsigned int i = 0; i < _num_levels; ++i) + { + _convf[i].run(); // convolute gaussian pyramid + } + + for(unsigned int i = 0; i < _num_levels; ++i) + { + _subf[i].run(); // compute laplacian image + } + + _depth_function.run(); +} diff --git a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp new file mode 100644 index 0000000000..1dfab740d7 --- /dev/null +++ b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IPyramid.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include + +using namespace arm_compute; + +CLLaplacianReconstruct::CLLaplacianReconstruct() + : _tmp_pyr(), _addf(), _scalef(), _depthf() +{ +} + +void CLLaplacianReconstruct::configure(const CLPyramid *pyramid, const ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON(nullptr == pyramid); + ARM_COMPUTE_ERROR_ON(input == output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); + ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); + ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(0)->info()->dimension(0)); + ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(0)->info()->dimension(1)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1)); + + const size_t num_levels = pyramid->info()->num_levels(); + + // Create and initialize the tmp pyramid: I(n-2) = upsample( input + Laplace(n-1) ) + PyramidInfo pyramid_info; + pyramid_info.init(num_levels, 0.5f, output->info()->tensor_shape(), arm_compute::Format::S16); + _tmp_pyr.init(pyramid_info); + + // Allocate add and scale functions. Level 0 does not need to be scaled. 
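+    // Reconstruction recurrence implemented by the functions configured below, written in informal
+    // notation ("upsample" and "depth_convert" are shorthand here, not API calls; L(i) stands for
+    // pyramid->get_pyramid_level(i) and tmp(i) for _tmp_pyr.get_pyramid_level(i)):
+    //
+    //   tmp(last_level) = input + L(last_level)
+    //   tmp(i)          = upsample(tmp(i + 1)) + L(i),   i = last_level - 1, ..., 0
+    //   output          = depth_convert(tmp(0))          (S16 -> U8, saturating)
+    //
+    // One CLScale performs each upsample and one CLArithmeticAddition each sum, hence
+    // num_levels additions are needed but only num_levels - 1 scale functions.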
+ _addf = arm_compute::cpp14::make_unique(num_levels); + _scalef = arm_compute::cpp14::make_unique(num_levels - 1); + + const size_t last_level = num_levels - 1; + + _addf[last_level].configure(input, pyramid->get_pyramid_level(last_level), _tmp_pyr.get_pyramid_level(last_level), ConvertPolicy::SATURATE); + + // Scale levels n-1 to 1, and add levels n-2 to 0 + for(size_t l = 0; l < last_level; ++l) + { + _scalef[l].configure(_tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value); + _addf[l].configure(_tmp_pyr.get_pyramid_level(l), pyramid->get_pyramid_level(l), _tmp_pyr.get_pyramid_level(l), ConvertPolicy::SATURATE); + } + + // Convert level 0 from S16 to U8 + _depthf.configure(_tmp_pyr.get_pyramid_level(0), output, ConvertPolicy::SATURATE, 0); + + _tmp_pyr.allocate(); +} + +void CLLaplacianReconstruct::run() +{ + ARM_COMPUTE_ERROR_ON_MSG(_addf == nullptr, "Unconfigured function"); + + const size_t last_level = _tmp_pyr.info()->num_levels() - 1; + + _addf[last_level].run(); + + // Run l = [last_level - 1, 0] + for(size_t l = last_level; l-- > 0;) + { + _scalef[l].run(); + _addf[l].run(); + } + + _depthf.run(); +} diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp new file mode 100644 index 0000000000..263fb51987 --- /dev/null +++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h" + +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include +#include + +using namespace arm_compute; + +CLLocallyConnectedLayer::CLLocallyConnectedLayer() + : _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), _is_first_run(false) +{ +} + +void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output); + ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2)); + + if(biases != nullptr) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); + ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3)); + ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 2); + } + + bool _has_bias = (biases != nullptr); + _is_first_run = true; + + // Get parameters for conv_info + unsigned int stride_x = 0; + unsigned int stride_y = 0; + unsigned int pad_x = 0; + unsigned int pad_y = 0; + std::tie(stride_x, stride_y) = conv_info.stride(); + std::tie(pad_x, pad_y) = conv_info.pad(); + + // Get convolved dimensions + unsigned int conv_w = 0; + unsigned int conv_h = 0; + std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), + stride_x, stride_y, pad_x, pad_y, conv_info.round()); + + ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one"); + ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one"); + + // Create tensor to store the reshaped weights + const size_t mat_weights_cols = weights->info()->dimension(3); + const size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 
1 : 0); + const size_t mat_weights_num = weights->info()->dimension(4); + + const TensorShape shape_wr(mat_weights_cols, mat_weights_rows, mat_weights_num); + + _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type())); + + // Create tensor to store im2col reshaped inputs + const size_t mat_input_cols = mat_weights_rows; + const size_t mat_input_rows = conv_w * conv_h; + TensorShape shape_im2col = input->info()->tensor_shape(); + shape_im2col.set(0, mat_input_cols); + shape_im2col.set(1, mat_input_rows); + shape_im2col.set(2, 1); + + _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type())); + + // Create locally connected layer output tensor + TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape(); + shape_gemm.set(0, mat_weights_cols); + shape_gemm.set(1, mat_input_rows); + _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type())); + + // Configure kernels + _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias); + _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped); + _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output); + _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h)); + + // Allocate intermediate tensors + _weights_reshaped.allocator()->allocate(); + _input_im2col_reshaped.allocator()->allocate(); + _gemm_output.allocator()->allocate(); +} + +void CLLocallyConnectedLayer::run() +{ + // Run weights reshaping (Runs once for every configure) + if(_is_first_run) + { + _is_first_run = false; + CLScheduler::get().enqueue(_weights_reshape_kernel); + } + + // Run input reshaping + CLScheduler::get().enqueue(_input_im2col_kernel); + + // Runs vector matrix multiply on reshaped matrices + CLScheduler::get().enqueue(_mm_kernel); + + // Reshape output matrix + CLScheduler::get().enqueue(_output_col2im_kernel, false); +} diff --git a/src/runtime/CL/functions/CLMagnitude.cpp b/src/runtime/CL/functions/CLMagnitude.cpp new file mode 100644 index 0000000000..51088cb71f --- /dev/null +++ b/src/runtime/CL/functions/CLMagnitude.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLMagnitude.h" + +#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h" +#include "arm_compute/core/Helpers.h" + +#include + +using namespace arm_compute; + +void CLMagnitude::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input1, input2, output, nullptr, mag_type); + _kernel = std::move(k); +} diff --git a/src/runtime/CL/functions/CLMeanStdDev.cpp b/src/runtime/CL/functions/CLMeanStdDev.cpp new file mode 100644 index 0000000000..56ba146790 --- /dev/null +++ b/src/runtime/CL/functions/CLMeanStdDev.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLMeanStdDev.h" + +#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +using namespace arm_compute; + +CLMeanStdDev::CLMeanStdDev() + : _mean_stddev_kernel(), + _global_sum(), + _global_sum_squared() +{ +} + +void CLMeanStdDev::configure(const ICLImage *input, float *mean, float *stddev) +{ + _global_sum = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong)); + + if(stddev != nullptr) + { + _global_sum_squared = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong)); + } + + _mean_stddev_kernel.configure(input, mean, &_global_sum, stddev, &_global_sum_squared); +} + +void CLMeanStdDev::run() +{ + CLScheduler::get().enqueue(_mean_stddev_kernel); +} diff --git a/src/runtime/CL/functions/CLMedian3x3.cpp b/src/runtime/CL/functions/CLMedian3x3.cpp new file mode 100644 index 0000000000..0c10f9aa08 --- /dev/null +++ b/src/runtime/CL/functions/CLMedian3x3.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLMedian3x3.h" + +#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/PixelValue.h" + +#include + +using namespace arm_compute; + +void CLMedian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, border_mode == BorderMode::UNDEFINED); + _kernel = std::move(k); + _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); +} diff --git a/src/runtime/CL/functions/CLMinMaxLocation.cpp b/src/runtime/CL/functions/CLMinMaxLocation.cpp new file mode 100644 index 0000000000..ad783d8a53 --- /dev/null +++ b/src/runtime/CL/functions/CLMinMaxLocation.cpp @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLMinMaxLocation.h" + +#include "arm_compute/core/CL/CLHelpers.h" + +using namespace arm_compute; + +CLMinMaxLocation::CLMinMaxLocation() + : _min_max_kernel(), + _min_max_loc_kernel(), + _min_max_vals(), + _min_max_count_vals(), + _min(nullptr), + _max(nullptr), + _min_count(nullptr), + _max_count(nullptr), + _min_loc(nullptr), + _max_loc(nullptr) +{ +} + +void CLMinMaxLocation::configure(const ICLImage *input, int32_t *min, int32_t *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count) +{ + ARM_COMPUTE_ERROR_ON(nullptr == min); + ARM_COMPUTE_ERROR_ON(nullptr == max); + + _min_max_vals = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 2 * sizeof(int32_t)); + _min_max_count_vals = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 2 * sizeof(uint32_t)); + _min = min; + _max = max; + _min_count = min_count; + _max_count = max_count; + _min_loc = min_loc; + _max_loc = max_loc; + + _min_max_kernel.configure(input, &_min_max_vals); + _min_max_loc_kernel.configure(input, &_min_max_vals, &_min_max_count_vals, _min_loc, _max_loc); +} + +void CLMinMaxLocation::run() +{ + cl::CommandQueue q = CLScheduler::get().queue(); + + CLScheduler::get().enqueue(_min_max_kernel, false); + CLScheduler::get().enqueue(_min_max_loc_kernel, false); + + // Update min and max + q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 0 * sizeof(int32_t), sizeof(int32_t), _min); + q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 1 * sizeof(int32_t), sizeof(int32_t), _max); + + // Update min and max count + if(_min_count != nullptr) + { + q.enqueueReadBuffer(_min_max_count_vals, CL_FALSE, 0 * sizeof(uint32_t), sizeof(uint32_t), _min_count); + } + if(_max_count != nullptr) + { + q.enqueueReadBuffer(_min_max_count_vals, CL_FALSE, 1 * sizeof(uint32_t), sizeof(uint32_t), _max_count); + } + + // Update min/max point arrays (Makes the kernel blocking) + if(_min_loc != nullptr) + { + unsigned int min_count = 0; + q.enqueueReadBuffer(_min_max_count_vals, CL_TRUE, 0 * sizeof(uint32_t), sizeof(uint32_t), &min_count); + size_t min_corner_size = std::min(static_cast(min_count), _min_loc->max_num_values()); + _min_loc->resize(min_corner_size); + } + if(_max_loc != nullptr) + { + unsigned int max_count = 0; + q.enqueueReadBuffer(_min_max_count_vals, CL_TRUE, 1 * sizeof(uint32_t), sizeof(uint32_t), &max_count); + size_t max_corner_size = std::min(static_cast(max_count), _max_loc->max_num_values()); + _max_loc->resize(max_corner_size); + } +} diff --git a/src/runtime/CL/functions/CLNonLinearFilter.cpp b/src/runtime/CL/functions/CLNonLinearFilter.cpp new file mode 100644 index 0000000000..b593a6cced --- /dev/null +++ b/src/runtime/CL/functions/CLNonLinearFilter.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLNonLinearFilter.h" + +#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h" +#include "arm_compute/core/Helpers.h" + +#include + +using namespace arm_compute; + +void CLNonLinearFilter::configure(ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, + BorderMode border_mode, uint8_t constant_border_value) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED); + _kernel = std::move(k); + _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); +} diff --git a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp new file mode 100644 index 0000000000..ca7d5aede7 --- /dev/null +++ b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h" + +#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h" +#include "arm_compute/core/Helpers.h" + +#include + +using namespace arm_compute; + +void CLNonMaximaSuppression3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, border_mode == BorderMode::UNDEFINED); + _kernel = std::move(k); + + if(border_mode != BorderMode::UNDEFINED) + { + _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT); + } + else + { + _border_handler.configure(input, _kernel->border_size(), BorderMode::UNDEFINED); + } +} diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp new file mode 100644 index 0000000000..2d89ebd676 --- /dev/null +++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +using namespace arm_compute; + +CLNormalizationLayer::CLNormalizationLayer() + : _squared_input(), _norm_kernel(), _multiply_kernel(), _border_handler() +{ +} + +void CLNormalizationLayer::configure(const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info) +{ + ARM_COMPUTE_ERROR_ON(input == nullptr); + + _squared_input.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, input->info()->data_type())); + + _norm_kernel.configure(input, &_squared_input, output, norm_info); + _multiply_kernel.configure(input, input, &_squared_input, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel + _border_handler.configure(&_squared_input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0)); + + // Allocate intermediate buffers + _squared_input.allocator()->allocate(); +} + +void CLNormalizationLayer::run() +{ + CLScheduler::get().enqueue(_multiply_kernel, false); + CLScheduler::get().enqueue(_border_handler, false); + CLScheduler::get().enqueue(_norm_kernel, false); +} diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp b/src/runtime/CL/functions/CLOpticalFlow.cpp new file mode 100644 index 0000000000..a6b0eb3bec --- /dev/null +++ b/src/runtime/CL/functions/CLOpticalFlow.cpp @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLOpticalFlow.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/runtime/CL/CLPyramid.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/CLTensorAllocator.h" +#include "arm_compute/runtime/CL/functions/CLScharr3x3.h" + +using namespace arm_compute; + +CLOpticalFlow::CLOpticalFlow() + : _tracker_init_kernel(), _tracker_stage0_kernel(), _tracker_stage1_kernel(), _tracker_finalize_kernel(), _func_scharr(), _scharr_gx(), _scharr_gy(), _old_points(nullptr), + _new_points_estimates(nullptr), _new_points(nullptr), _old_points_internal(), _new_points_internal(), _coefficient_table(), _old_values(), _num_levels(0) +{ +} + +void CLOpticalFlow::configure(const CLPyramid *old_pyramid, const CLPyramid *new_pyramid, + const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points, + Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate, + BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON(nullptr == old_pyramid); + ARM_COMPUTE_ERROR_ON(nullptr == new_pyramid); + ARM_COMPUTE_ERROR_ON(nullptr == old_points); + ARM_COMPUTE_ERROR_ON(nullptr == new_points_estimates); + ARM_COMPUTE_ERROR_ON(nullptr == new_points); + ARM_COMPUTE_ERROR_ON(old_pyramid->info()->num_levels() != new_pyramid->info()->num_levels()); + ARM_COMPUTE_ERROR_ON(0 == old_pyramid->info()->num_levels()); + ARM_COMPUTE_ERROR_ON(old_pyramid->info()->width() != new_pyramid->info()->width()); + ARM_COMPUTE_ERROR_ON(old_pyramid->info()->height() != new_pyramid->info()->height()); + ARM_COMPUTE_ERROR_ON(use_initial_estimate && old_points->num_values() != new_points_estimates->num_values()); + + // Set member variables + _old_points = old_points; + _new_points_estimates = new_points_estimates; + _new_points = new_points; + _num_levels = old_pyramid->info()->num_levels(); + + const float pyr_scale = old_pyramid->info()->scale(); + const int list_length = old_points->num_values(); + const int old_values_list_length = list_length * window_dimension * window_dimension; + + // Create kernels and tensors + _tracker_init_kernel = arm_compute::cpp14::make_unique(_num_levels); + _tracker_stage0_kernel = arm_compute::cpp14::make_unique(_num_levels); + _tracker_stage1_kernel = arm_compute::cpp14::make_unique(_num_levels); + _func_scharr = arm_compute::cpp14::make_unique(_num_levels); + _scharr_gx = arm_compute::cpp14::make_unique(_num_levels); + _scharr_gy = arm_compute::cpp14::make_unique(_num_levels); + + // Create internal keypoint arrays + _old_points_internal = arm_compute::cpp14::make_unique(list_length); + _old_points_internal->resize(list_length); + _new_points_internal = arm_compute::cpp14::make_unique(list_length); + _new_points_internal->resize(list_length); + _coefficient_table = arm_compute::cpp14::make_unique(list_length); + _coefficient_table->resize(list_length); + _old_values = arm_compute::cpp14::make_unique(old_values_list_length); + _old_values->resize(old_values_list_length); + _new_points->resize(list_length); + + for(size_t i = 0; i < _num_levels; ++i) + { + // Get images from the ith level of old and right pyramid + ICLImage *old_ith_input = 
old_pyramid->get_pyramid_level(i); + ICLImage *new_ith_input = new_pyramid->get_pyramid_level(i); + + // Get width and height of images + const unsigned int width_ith = old_ith_input->info()->dimension(0); + const unsigned int height_ith = new_ith_input->info()->dimension(1); + + // Initialize Scharr tensors + TensorInfo tensor_info(TensorShape(width_ith, height_ith), 1, DataType::S16); + _scharr_gx[i].allocator()->init(tensor_info); + _scharr_gy[i].allocator()->init(tensor_info); + + // Init Scharr kernel + _func_scharr[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value); + + // Init Lucas-Kanade init kernel + _tracker_init_kernel[i].configure(old_points, new_points_estimates, _old_points_internal.get(), _new_points_internal.get(), use_initial_estimate, i, _num_levels, pyr_scale); + + // Init Lucas-Kanade stage0 kernel + _tracker_stage0_kernel[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i], + _old_points_internal.get(), _new_points_internal.get(), _coefficient_table.get(), _old_values.get(), + window_dimension, i); + + // Init Lucas-Kanade stage1 kernel + _tracker_stage1_kernel[i].configure(new_ith_input, _new_points_internal.get(), _coefficient_table.get(), _old_values.get(), + termination, epsilon, num_iterations, window_dimension, i); + + // Allocate intermediate buffers + _scharr_gx[i].allocator()->allocate(); + _scharr_gy[i].allocator()->allocate(); + } + + // Finalize Lucas-Kanade + _tracker_finalize_kernel.configure(_new_points_internal.get(), new_points); +} + +void CLOpticalFlow::run() +{ + ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function"); + + for(unsigned int level = _num_levels; level > 0; --level) + { + // Run Scharr kernel + _func_scharr[level - 1].run(); + + // Run Lucas-Kanade init kernel + CLScheduler::get().enqueue(_tracker_init_kernel[level - 1]); + + // Run Lucas-Kanade stage0 kernel + CLScheduler::get().enqueue(_tracker_stage0_kernel[level - 1]); + + // Run Lucas-Kanade stage1 kernel + CLScheduler::get().enqueue(_tracker_stage1_kernel[level - 1]); + } + + CLScheduler::get().enqueue(_tracker_finalize_kernel, true); +} diff --git a/src/runtime/CL/functions/CLPhase.cpp b/src/runtime/CL/functions/CLPhase.cpp new file mode 100644 index 0000000000..a8cb22b06e --- /dev/null +++ b/src/runtime/CL/functions/CLPhase.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLPhase.h" + +#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h" +#include "arm_compute/core/Helpers.h" + +#include + +using namespace arm_compute; + +void CLPhase::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input1, input2, nullptr, output, MagnitudeType::L1NORM, phase_type); + _kernel = std::move(k); +} diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp new file mode 100644 index 0000000000..8a86c2e203 --- /dev/null +++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h" + +#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h" +#include "arm_compute/core/Helpers.h" + +#include + +using namespace arm_compute; + +void CLPixelWiseMultiplication::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale, + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input1, input2, output, scale, overflow_policy, rounding_policy); + _kernel = std::move(k); +} diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp new file mode 100644 index 0000000000..1ef70f4a2b --- /dev/null +++ b/src/runtime/CL/functions/CLPoolingLayer.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h" + +#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h" +#include "arm_compute/core/Helpers.h" + +using namespace arm_compute; + +void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info) +{ + // Configure pooling kernel + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, pool_info); + _kernel = std::move(k); + + // Configure border depending on operation required + BorderMode border_mode = (PoolingType::MAX == pool_info.pool_type()) ? BorderMode::REPLICATE : BorderMode::CONSTANT; + _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(0)); +} diff --git a/src/runtime/CL/functions/CLRemap.cpp b/src/runtime/CL/functions/CLRemap.cpp new file mode 100644 index 0000000000..f6b1713c58 --- /dev/null +++ b/src/runtime/CL/functions/CLRemap.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
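CLRemap, implemented next, reads its sampling coordinates from two F32 maps with the same shape as the output: for every output pixel, map_x and map_y hold the x and y position to fetch from the input, and AREA interpolation is rejected by the ERROR_ON_MSG check in configure(). A hedged configuration sketch, reusing the headers and CL initialisation of the phase_example() sketch above, with illustrative sizes:

#include "arm_compute/runtime/CL/functions/CLRemap.h"

using namespace arm_compute;

void remap_example()
{
    const TensorShape shape(640, 480);
    CLTensor src, dst, map_x, map_y;
    src.allocator()->init(TensorInfo(shape, 1, DataType::U8));
    dst.allocator()->init(TensorInfo(shape, 1, DataType::U8));
    map_x.allocator()->init(TensorInfo(shape, 1, DataType::F32));
    map_y.allocator()->init(TensorInfo(shape, 1, DataType::F32));

    CLRemap remap;
    remap.configure(&src, &map_x, &map_y, &dst, InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::CONSTANT, 0);
    // ... allocate the tensors, fill src and the two maps, then remap.run() ...
}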
+ */ +#include "arm_compute/runtime/CL/functions/CLRemap.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLRemapKernel.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include + +using namespace arm_compute; + +void CLRemap::configure(ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported"); + + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, map_x, map_y, output, policy, border_mode == BorderMode::UNDEFINED); + _kernel = std::move(k); + _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); +} diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp new file mode 100644 index 0000000000..043f873028 --- /dev/null +++ b/src/runtime/CL/functions/CLScale.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLScale.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLScaleKernel.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" + +using namespace arm_compute; + +void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON(output == input); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, policy, border_mode == BorderMode::UNDEFINED); + _kernel = std::move(k); + _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value); +} diff --git a/src/runtime/CL/functions/CLScharr3x3.cpp b/src/runtime/CL/functions/CLScharr3x3.cpp new file mode 100644 index 0000000000..c8bc465be6 --- /dev/null +++ b/src/runtime/CL/functions/CLScharr3x3.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLScharr3x3.h" + +#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/PixelValue.h" + +#include + +using namespace arm_compute; + +void CLScharr3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED); + _kernel = std::move(k); + _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); +} diff --git a/src/runtime/CL/functions/CLSobel3x3.cpp b/src/runtime/CL/functions/CLSobel3x3.cpp new file mode 100644 index 0000000000..6b74ebaedb --- /dev/null +++ b/src/runtime/CL/functions/CLSobel3x3.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLSobel3x3.h" + +#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/PixelValue.h" + +#include + +using namespace arm_compute; + +void CLSobel3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED); + _kernel = std::move(k); + _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); +} diff --git a/src/runtime/CL/functions/CLSobel5x5.cpp b/src/runtime/CL/functions/CLSobel5x5.cpp new file mode 100644 index 0000000000..098b546c1a --- /dev/null +++ b/src/runtime/CL/functions/CLSobel5x5.cpp @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
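CLSobel5x5, implemented next, runs the 5x5 filter separably: a horizontal pass writes S16 intermediates (_tmp_x/_tmp_y) and a vertical pass produces the final gradients, with the intermediates only initialised and allocated for the outputs actually requested. Passing nullptr for output_x or output_y therefore skips that direction entirely, as in this sketch (same assumptions as the sketches above, sizes illustrative):

#include "arm_compute/runtime/CL/functions/CLSobel5x5.h"

using namespace arm_compute;

void sobel5x5_example()
{
    CLTensor src, gx;
    src.allocator()->init(TensorInfo(TensorShape(640, 480), 1, DataType::U8));
    gx.allocator()->init(TensorInfo(TensorShape(640, 480), 1, DataType::S16));

    CLSobel5x5 sobel;
    sobel.configure(&src, &gx, nullptr, BorderMode::UNDEFINED); // y gradient skipped, _tmp_y never allocated
    // ... allocate, fill src, then sobel.run() ...
}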
+ */ +#include "arm_compute/runtime/CL/functions/CLSobel5x5.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/ITensorAllocator.h" + +using namespace arm_compute; + +CLSobel5x5::CLSobel5x5() + : _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y() +{ +} + +void CLSobel5x5::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + + const bool run_sobel_x = output_x != nullptr; + const bool run_sobel_y = output_y != nullptr; + + TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::S16); + + if(run_sobel_x && run_sobel_y) + { + _tmp_x.allocator()->init(tensor_info); + _tmp_y.allocator()->init(tensor_info); + _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); + _tmp_x.allocator()->allocate(); + _tmp_y.allocator()->allocate(); + } + else if(run_sobel_x) + { + _tmp_x.allocator()->init(tensor_info); + _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); + _tmp_x.allocator()->allocate(); + } + else if(run_sobel_y) + { + _tmp_y.allocator()->init(tensor_info); + _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); + _tmp_y.allocator()->allocate(); + } + _border_handler.configure(input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); +} + +void CLSobel5x5::run() +{ + CLScheduler::get().enqueue(_border_handler, false); + CLScheduler::get().enqueue(_sobel_hor, false); + CLScheduler::get().enqueue(_sobel_vert); +} diff --git a/src/runtime/CL/functions/CLSobel7x7.cpp b/src/runtime/CL/functions/CLSobel7x7.cpp new file mode 100644 index 0000000000..db84fa99ae --- /dev/null +++ b/src/runtime/CL/functions/CLSobel7x7.cpp @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLSobel7x7.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/ITensorAllocator.h" + +using namespace arm_compute; + +CLSobel7x7::CLSobel7x7() + : _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y() +{ +} + +void CLSobel7x7::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + + const bool run_sobel_x = output_x != nullptr; + const bool run_sobel_y = output_y != nullptr; + + TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::S32); + + if(run_sobel_x && run_sobel_y) + { + _tmp_x.allocator()->init(tensor_info); + _tmp_y.allocator()->init(tensor_info); + _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); + _tmp_x.allocator()->allocate(); + _tmp_y.allocator()->allocate(); + } + else if(run_sobel_x) + { + _tmp_x.allocator()->init(tensor_info); + _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); + _tmp_x.allocator()->allocate(); + } + else if(run_sobel_y) + { + _tmp_y.allocator()->init(tensor_info); + _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); + _tmp_y.allocator()->allocate(); + } + _border_handler.configure(input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); +} + +void CLSobel7x7::run() +{ + CLScheduler::get().enqueue(_border_handler, false); + CLScheduler::get().enqueue(_sobel_hor, false); + CLScheduler::get().enqueue(_sobel_vert); +} diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp new file mode 100644 index 0000000000..2a78c58053 --- /dev/null +++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h" + +#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +using namespace arm_compute; + +CLSoftmaxLayer::CLSoftmaxLayer() + : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp() +{ +} + +void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); + + // Create intermediate tensors shapes + _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type())); + + TensorShape shape = input->info()->tensor_shape(); + shape.set(0, 1); + TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type()); + _max.allocator()->init(tensor_info_max_sum); + _sum.allocator()->init(tensor_info_max_sum); + + // Configure Kernels + _max_kernel.configure(input, &_max); + _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum); + _norm_kernel.configure(&_tmp, &_sum, output); + + // Allocate intermediate buffers + _tmp.allocator()->allocate(); + _max.allocator()->allocate(); + _sum.allocator()->allocate(); +} + +void CLSoftmaxLayer::run() +{ + CLScheduler::get().enqueue(_max_kernel, false); + CLScheduler::get().enqueue(_shift_exp_sum_kernel, false); + CLScheduler::get().enqueue(_norm_kernel); +} diff --git a/src/runtime/CL/functions/CLTableLookup.cpp b/src/runtime/CL/functions/CLTableLookup.cpp new file mode 100644 index 0000000000..743ed5e73e --- /dev/null +++ b/src/runtime/CL/functions/CLTableLookup.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
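The CLSoftmaxLayer::configure above splits softmax into three kernels: a row-wise maximum (_max_kernel), shifted exponentials plus their running sum (_shift_exp_sum_kernel), and a final normalisation (_norm_kernel); subtracting the maximum first keeps the exponentials from overflowing. A plain C++ reference of the same three stages for a single row of logits, as an illustration only, not the OpenCL implementation:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> softmax_reference(const std::vector<float> &x)
{
    if(x.empty())
    {
        return {};
    }

    // Stage 1 (_max_kernel): row-wise maximum, for numerical stability.
    const float max_val = *std::max_element(x.begin(), x.end());

    // Stage 2 (_shift_exp_sum_kernel): shifted exponentials and their sum.
    std::vector<float> y(x.size());
    float sum = 0.f;
    for(std::size_t i = 0; i < x.size(); ++i)
    {
        y[i] = std::exp(x[i] - max_val);
        sum += y[i];
    }

    // Stage 3 (_norm_kernel): divide every element by the sum.
    for(float &v : y)
    {
        v /= sum;
    }
    return y;
}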
+ */ +#include "arm_compute/runtime/CL/functions/CLTableLookup.h" + +#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h" +#include "arm_compute/core/Helpers.h" + +#include + +using namespace arm_compute; + +void CLTableLookup::configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, lut, output); + _kernel = std::move(k); +} diff --git a/src/runtime/CL/functions/CLThreshold.cpp b/src/runtime/CL/functions/CLThreshold.cpp new file mode 100644 index 0000000000..e70f932d66 --- /dev/null +++ b/src/runtime/CL/functions/CLThreshold.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLThreshold.h" + +#include "arm_compute/core/CL/kernels/CLThresholdKernel.h" +#include "arm_compute/core/Helpers.h" + +#include + +using namespace arm_compute; + +void CLThreshold::configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, threshold, false_value, true_value, type, upper); + _kernel = std::move(k); +} diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp new file mode 100644 index 0000000000..d802b4fe77 --- /dev/null +++ b/src/runtime/CL/functions/CLTranspose.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLTranspose.h" + +#include "arm_compute/core/CL/kernels/CLTransposeKernel.h" +#include "arm_compute/core/Helpers.h" + +#include + +using namespace arm_compute; + +void CLTranspose::configure(const ICLTensor *input, ICLTensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); +} \ No newline at end of file diff --git a/src/runtime/CL/functions/CLWarpAffine.cpp b/src/runtime/CL/functions/CLWarpAffine.cpp new file mode 100644 index 0000000000..537e0d9397 --- /dev/null +++ b/src/runtime/CL/functions/CLWarpAffine.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLWarpAffine.h" + +#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/PixelValue.h" + +#include + +using namespace arm_compute; + +void CLWarpAffine::configure(ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, matrix, policy); + _kernel = std::move(k); + _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); +} diff --git a/src/runtime/CL/functions/CLWarpPerspective.cpp b/src/runtime/CL/functions/CLWarpPerspective.cpp new file mode 100644 index 0000000000..a552ab480d --- /dev/null +++ b/src/runtime/CL/functions/CLWarpPerspective.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLWarpPerspective.h" + +#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/PixelValue.h" + +#include + +using namespace arm_compute; + +void CLWarpPerspective::configure(ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, matrix, policy); + _kernel = std::move(k); + _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); +} diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp new file mode 100644 index 0000000000..886933074d --- /dev/null +++ b/src/runtime/CPP/CPPScheduler.cpp @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
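The CPPScheduler that follows keeps a pool of worker threads parked on POSIX semaphores; schedule() splits a kernel's maximum window into sub-windows along one dimension, hands one to each worker and runs the last chunk on the calling thread, capping the split at the number of iterations available. A usage sketch; that NEON functions reach this scheduler through NEScheduler is the default build behaviour and is assumed here, and the wrapper name and sizes are illustrative:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CPP/CPPScheduler.h"
#include "arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void scheduler_example()
{
    CPPScheduler::get().set_num_threads(4); // 0 restores std::thread::hardware_concurrency()

    Tensor a, b, out;
    const TensorInfo info(TensorShape(1920, 1080), 1, DataType::U8);
    a.allocator()->init(info);
    b.allocator()->init(info);
    out.allocator()->init(info);
    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();

    NEAbsoluteDifference absdiff;
    absdiff.configure(&a, &b, &out);
    // run() calls NEScheduler::get().schedule(kernel, Window::DimY), so the 1080
    // rows are split into at most 4 sub-windows processed in parallel.
    absdiff.run();
}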
+ */ +#include "arm_compute/runtime/CPP/CPPScheduler.h" + +#include "arm_compute/core/CPP/ICPPKernel.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" + +#include +#include +#include +#include + +using namespace arm_compute; + +class arm_compute::Thread +{ +public: + /** Start a new thread + */ + Thread(); + Thread(const Thread &) = delete; + Thread &operator=(const Thread &) = delete; + Thread(Thread &&) = delete; + Thread &operator=(Thread &&) = delete; + /** Make the thread join + */ + ~Thread(); + /** Request the worker thread to start executing the given kernel + * This function will return as soon as the kernel has been sent to the worker thread. + * wait() needs to be called to ensure the execution is complete. + */ + void start(ICPPKernel *kernel, const Window &window); + /** Wait for the current kernel execution to complete + */ + void wait(); + /** Function ran by the worker thread + */ + void worker_thread(); + +private: + std::thread _thread; + ICPPKernel *_kernel{ nullptr }; + Window _window; + sem_t _wait_for_work; + sem_t _job_complete; + std::exception_ptr _current_exception; +}; + +Thread::Thread() + : _thread(), _window(), _wait_for_work(), _job_complete(), _current_exception(nullptr) +{ + int ret = sem_init(&_wait_for_work, 0, 0); + ARM_COMPUTE_ERROR_ON(ret < 0); + ARM_COMPUTE_UNUSED(ret); + + ret = sem_init(&_job_complete, 0, 0); + ARM_COMPUTE_ERROR_ON(ret < 0); + ARM_COMPUTE_UNUSED(ret); + + _thread = std::thread(&Thread::worker_thread, this); +} + +Thread::~Thread() +{ + ARM_COMPUTE_ERROR_ON(!_thread.joinable()); + + start(nullptr, Window()); + _thread.join(); + + int ret = sem_destroy(&_wait_for_work); + ARM_COMPUTE_ERROR_ON(ret < 0); + ARM_COMPUTE_UNUSED(ret); + + ret = sem_destroy(&_job_complete); + ARM_COMPUTE_ERROR_ON(ret < 0); + ARM_COMPUTE_UNUSED(ret); +} + +void Thread::start(ICPPKernel *kernel, const Window &window) +{ + _kernel = kernel; + _window = window; + int ret = sem_post(&_wait_for_work); + ARM_COMPUTE_UNUSED(ret); + ARM_COMPUTE_ERROR_ON(ret < 0); +} + +void Thread::wait() +{ + int ret = sem_wait(&_job_complete); + ARM_COMPUTE_UNUSED(ret); + ARM_COMPUTE_ERROR_ON(ret < 0); + if(_current_exception) + { + std::rethrow_exception(_current_exception); + } +} + +void Thread::worker_thread() +{ + while(sem_wait(&_wait_for_work) >= 0) + { + _current_exception = nullptr; + // Time to exit + if(_kernel == nullptr) + { + return; + } + + try + { + _window.validate(); + _kernel->run(_window); + } + catch(...) + { + _current_exception = std::current_exception(); + } + int ret = sem_post(&_job_complete); + ARM_COMPUTE_UNUSED(ret); + ARM_COMPUTE_ERROR_ON(ret < 0); + } + + ARM_COMPUTE_ERROR("Wait failed"); +} + +namespace +{ +void delete_threads(Thread *t) +{ + delete[] t; +} +} // namespace + +CPPScheduler &CPPScheduler::get() +{ + static CPPScheduler scheduler; + return scheduler; +} + +unsigned int CPPScheduler::num_threads() const +{ + return _num_threads; +} + +CPPScheduler::CPPScheduler() + : _num_threads(std::thread::hardware_concurrency()), + _threads(std::unique_ptr(new Thread[std::thread::hardware_concurrency() - 1], delete_threads)) +{ +} + +void CPPScheduler::set_num_threads(unsigned int num_threads) +{ + const unsigned int num_cores = std::thread::hardware_concurrency(); + _num_threads = num_threads == 0 ? 
num_cores : num_threads; +} + +void CPPScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension) +{ + ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel"); + + /** [Scheduler example] */ + const Window &max_window = kernel->window(); + const unsigned int num_iterations = max_window.num_iterations(split_dimension); + const unsigned int num_threads = std::min(num_iterations, _num_threads); + + if(!kernel->is_parallelisable() || 1 == num_threads) + { + kernel->run(max_window); + } + else + { + for(unsigned int t = 0; t < num_threads; ++t) + { + Window win = max_window.split_window(split_dimension, t, num_threads); + win.set_thread_id(t); + win.set_num_threads(num_threads); + + if(t != num_threads - 1) + { + _threads[t].start(kernel, win); + } + else + { + kernel->run(win); + } + } + + try + { + for(unsigned int t = 1; t < num_threads; ++t) + { + _threads[t - 1].wait(); + } + } + catch(const std::system_error &e) + { + std::cout << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n'; + } + } + /** [Scheduler example] */ +} diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp new file mode 100644 index 0000000000..f086813e91 --- /dev/null +++ b/src/runtime/CPP/SingleThreadScheduler.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/SingleThreadScheduler.h" + +#include "arm_compute/core/CPP/ICPPKernel.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Utils.h" + +using namespace arm_compute; + +SingleThreadScheduler &SingleThreadScheduler::get() +{ + static SingleThreadScheduler scheduler; + return scheduler; +} + +void SingleThreadScheduler::set_num_threads(unsigned int num_threads) +{ + ARM_COMPUTE_UNUSED(num_threads); +} + +void SingleThreadScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension) +{ + ARM_COMPUTE_UNUSED(split_dimension); + kernel->run(kernel->window()); +} + +unsigned int SingleThreadScheduler::num_threads() const +{ + return 1; +} diff --git a/src/runtime/Distribution1D.cpp b/src/runtime/Distribution1D.cpp new file mode 100644 index 0000000000..b06767499b --- /dev/null +++ b/src/runtime/Distribution1D.cpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/Distribution1D.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" + +#include <cstdint> + +using namespace arm_compute; + +Distribution1D::Distribution1D(size_t num_bins, int32_t offset, uint32_t range) + : IDistribution1D(num_bins, offset, range), _data(arm_compute::cpp14::make_unique<uint32_t[]>(num_bins)) +{ +} + +uint32_t *Distribution1D::buffer() const +{ + ARM_COMPUTE_ERROR_ON(nullptr == _data); + return _data.get(); +} diff --git a/src/runtime/HOG.cpp b/src/runtime/HOG.cpp new file mode 100644 index 0000000000..5d533dded4 --- /dev/null +++ b/src/runtime/HOG.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
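The Distribution1D above is a thin owner of a uint32_t bin array on top of IDistribution1D; histogram functions (e.g. NEHistogram/CLHistogram, assumed here) fill it during their run and the raw counts are read back through buffer(). A small sketch with the usual 256-bin U8 layout:

#include "arm_compute/runtime/Distribution1D.h"

#include <cstdint>

using namespace arm_compute;

// 256 bins, first bin starting at value 0, covering a range of 256 values.
Distribution1D hist(256, 0, 256);
// ... pass &hist to the histogram function's configure(), run it, then:
const uint32_t *bins = hist.buffer(); // per-bin counts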
+ */ +#include "arm_compute/runtime/HOG.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" + +using namespace arm_compute; + +HOG::HOG() + : IHOG(), _info(), _descriptor(nullptr) +{ +} + +void HOG::init(const HOGInfo &input) +{ + ARM_COMPUTE_ERROR_ON(nullptr != _descriptor); + _info = input; + _descriptor = arm_compute::cpp14::make_unique(_info.descriptor_size()); +} + +float *HOG::descriptor() const +{ + return _descriptor.get(); +} + +const HOGInfo *HOG::info() const +{ + return &_info; +} diff --git a/src/runtime/ILutAllocator.cpp b/src/runtime/ILutAllocator.cpp new file mode 100644 index 0000000000..fb961638f1 --- /dev/null +++ b/src/runtime/ILutAllocator.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/ILutAllocator.h" + +#include "arm_compute/core/Utils.h" + +using namespace arm_compute; + +ILutAllocator::ILutAllocator() + : _num_elements(0), _data_type(DataType::U8) +{ +} + +void ILutAllocator::init(size_t num_elements, DataType data_type) +{ + // Init internal metadata + _num_elements = num_elements; + _data_type = data_type; + + // Allocate the image's memory + allocate(); +} + +size_t ILutAllocator::num_elements() const +{ + return _num_elements; +} + +DataType ILutAllocator::type() const +{ + return _data_type; +} + +size_t ILutAllocator::size() const +{ + return data_size_from_type(_data_type) * num_elements(); +} diff --git a/src/runtime/ITensorAllocator.cpp b/src/runtime/ITensorAllocator.cpp new file mode 100644 index 0000000000..8294201384 --- /dev/null +++ b/src/runtime/ITensorAllocator.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/ITensorAllocator.h" + +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" + +using namespace arm_compute; + +ITensorAllocator::ITensorAllocator() + : _info() +{ +} + +void ITensorAllocator::init(const TensorInfo &input) +{ + _info = input; +} + +TensorInfo &ITensorAllocator::info() +{ + return _info; +} + +const TensorInfo &ITensorAllocator::info() const +{ + return _info; +} diff --git a/src/runtime/Lut.cpp b/src/runtime/Lut.cpp new file mode 100644 index 0000000000..1b3daf1f60 --- /dev/null +++ b/src/runtime/Lut.cpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/Lut.h" + +#include + +using namespace arm_compute; + +Lut::Lut() + : _allocator() +{ +} + +Lut::Lut(size_t num_elements, DataType data_type) + : _allocator() +{ + _allocator.init(num_elements, data_type); +} + +size_t Lut::num_elements() const +{ + return _allocator.num_elements(); +} + +uint32_t Lut::index_offset() const +{ + return (DataType::S16 == _allocator.type()) ? num_elements() / 2 : 0; +} + +size_t Lut::size_in_bytes() const +{ + return _allocator.size(); +} + +DataType Lut::type() const +{ + return _allocator.type(); +} + +uint8_t *Lut::buffer() const +{ + return _allocator.data(); +} + +void Lut::clear() +{ + ARM_COMPUTE_ERROR_ON(this->buffer() == nullptr); + std::memset(this->buffer(), 0, this->size_in_bytes()); +} + +ILutAllocator *Lut::allocator() +{ + return &_allocator; +} diff --git a/src/runtime/LutAllocator.cpp b/src/runtime/LutAllocator.cpp new file mode 100644 index 0000000000..17baf21f45 --- /dev/null +++ b/src/runtime/LutAllocator.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/LutAllocator.h" + +#include "arm_compute/core/Helpers.h" + +using namespace arm_compute; + +LutAllocator::LutAllocator() + : _buffer(nullptr) +{ +} + +uint8_t *LutAllocator::data() const +{ + return _buffer.get(); +} + +void LutAllocator::allocate() +{ + _buffer = arm_compute::cpp14::make_unique(size()); +} + +uint8_t *LutAllocator::lock() +{ + return _buffer.get(); +} + +void LutAllocator::unlock() +{ +} diff --git a/src/runtime/MultiHOG.cpp b/src/runtime/MultiHOG.cpp new file mode 100644 index 0000000000..003dc93895 --- /dev/null +++ b/src/runtime/MultiHOG.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
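The Lut/LutAllocator pair above allocates its backing store when ILutAllocator::init() runs from the Lut constructor, so no separate allocate() call is needed, and index_offset() reports the half-table offset used for signed S16 lookups. A short sketch building a U8 inversion table, e.g. for the NEON table-lookup function:

#include "arm_compute/runtime/Lut.h"

#include <cstddef>
#include <cstdint>

using namespace arm_compute;

void build_inversion_lut()
{
    Lut lut(256, DataType::U8); // storage is allocated by the constructor
    lut.clear();                // zero the table

    uint8_t *table = lut.buffer();
    for(std::size_t i = 0; i < lut.num_elements(); ++i)
    {
        table[i] = static_cast<uint8_t>(255 - i);
    }
    // ... pass &lut to e.g. NETableLookup::configure() ...
}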
+ */ +#include "arm_compute/runtime/MultiHOG.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IMultiHOG.h" + +using namespace arm_compute; + +MultiHOG::MultiHOG(size_t num_models) + : _num_models(num_models), _model(arm_compute::cpp14::make_unique(_num_models)) +{ +} + +size_t MultiHOG::num_models() const +{ + return _num_models; +} + +IHOG *MultiHOG::model(size_t index) +{ + ARM_COMPUTE_ERROR_ON(index >= _num_models); + return (_model.get() + index); +} + +const IHOG *MultiHOG::model(size_t index) const +{ + ARM_COMPUTE_ERROR_ON(index >= _num_models); + return (_model.get() + index); +} diff --git a/src/runtime/MultiImage.cpp b/src/runtime/MultiImage.cpp new file mode 100644 index 0000000000..def1487c5e --- /dev/null +++ b/src/runtime/MultiImage.cpp @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/MultiImage.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/TensorAllocator.h" + +using namespace arm_compute; + +MultiImage::MultiImage() + : _info(), _plane() +{ +} + +const MultiImageInfo *MultiImage::info() const +{ + return &_info; +} + +void MultiImage::init(unsigned int width, unsigned int height, Format format) +{ + internal_init(width, height, format, false); +} + +void MultiImage::init_auto_padding(unsigned int width, unsigned int height, Format format) +{ + internal_init(width, height, format, true); +} + +void MultiImage::internal_init(unsigned int width, unsigned int height, Format format, bool auto_padding) +{ + TensorInfo info(width, height, Format::U8); + + if(auto_padding) + { + info.auto_padding(); + } + + switch(format) + { + case Format::U8: + case Format::S16: + case Format::U16: + case Format::S32: + case Format::F16: + case Format::F32: + case Format::U32: + case Format::RGB888: + case Format::RGBA8888: + case Format::YUYV422: + case Format::UYVY422: + { + TensorInfo info_full(width, height, format); + + if(auto_padding) + { + info_full.auto_padding(); + } + + std::get<0>(_plane).allocator()->init(info_full); + break; + } + case Format::NV12: + case Format::NV21: + { + TensorInfo info_uv88(width / 2, height / 2, Format::UV88); + + if(auto_padding) + { + info_uv88.auto_padding(); + } + + std::get<0>(_plane).allocator()->init(info); + std::get<1>(_plane).allocator()->init(info_uv88); + break; + } + case Format::IYUV: + { + TensorInfo info_sub2(width / 2, height / 2, Format::U8); + + if(auto_padding) + { + info_sub2.auto_padding(); + } + + std::get<0>(_plane).allocator()->init(info); + std::get<1>(_plane).allocator()->init(info_sub2); + std::get<2>(_plane).allocator()->init(info_sub2); + break; + } + case Format::YUV444: + std::get<0>(_plane).allocator()->init(info); + std::get<1>(_plane).allocator()->init(info); + std::get<2>(_plane).allocator()->init(info); + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + + _info.init(width, height, format); +} + +void MultiImage::allocate() +{ + switch(_info.format()) + { + case Format::U8: + case Format::S16: + case Format::U16: + case Format::S32: + case Format::F16: + case Format::F32: + case Format::U32: + case Format::RGB888: + case Format::RGBA8888: + case Format::YUYV422: + case Format::UYVY422: + std::get<0>(_plane).allocator()->allocate(); + break; + case Format::NV12: + case Format::NV21: + std::get<0>(_plane).allocator()->allocate(); + std::get<1>(_plane).allocator()->allocate(); + break; + case Format::IYUV: + case Format::YUV444: + std::get<0>(_plane).allocator()->allocate(); + std::get<1>(_plane).allocator()->allocate(); + std::get<2>(_plane).allocator()->allocate(); + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } +} + +void MultiImage::create_subimage(MultiImage *image, const Coordinates &coords, unsigned int width, unsigned int height) +{ + arm_compute::Format format = image->info()->format(); + const TensorInfo info(width, height, Format::U8); + + switch(format) + { + case Format::U8: + case Format::S16: + case Format::U16: + case Format::S32: + case Format::F32: + case Format::F16: + case Format::U32: + case Format::RGB888: + case Format::RGBA8888: + case Format::YUYV422: + case Format::UYVY422: + { + const TensorInfo info_full(width, height, format); + std::get<0>(_plane).allocator()->init(*dynamic_cast(image->plane(0))->allocator(), coords, info_full); + break; + } + 
case Format::NV12: + case Format::NV21: + { + const TensorInfo info_uv88(width / 2, height / 2, Format::UV88); + std::get<0>(_plane).allocator()->init(*dynamic_cast(image->plane(0))->allocator(), coords, info); + std::get<1>(_plane).allocator()->init(*dynamic_cast(image->plane(1))->allocator(), coords, info_uv88); + break; + } + case Format::IYUV: + { + const TensorInfo info_sub2(width / 2, height / 2, Format::U8); + std::get<0>(_plane).allocator()->init(*dynamic_cast(image->plane(0))->allocator(), coords, info); + std::get<1>(_plane).allocator()->init(*dynamic_cast(image->plane(1))->allocator(), coords, info_sub2); + std::get<2>(_plane).allocator()->init(*dynamic_cast(image->plane(2))->allocator(), coords, info_sub2); + break; + } + case Format::YUV444: + std::get<0>(_plane).allocator()->init(*dynamic_cast(image->plane(0))->allocator(), coords, info); + std::get<1>(_plane).allocator()->init(*dynamic_cast(image->plane(0))->allocator(), coords, info); + std::get<2>(_plane).allocator()->init(*dynamic_cast(image->plane(0))->allocator(), coords, info); + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + + _info.init(width, height, format); +} + +Image *MultiImage::plane(unsigned int index) +{ + return &_plane[index]; +} + +const Image *MultiImage::plane(unsigned int index) const +{ + return &_plane[index]; +} diff --git a/src/runtime/NEON/INESimpleFunction.cpp b/src/runtime/NEON/INESimpleFunction.cpp new file mode 100644 index 0000000000..6f0da85fc8 --- /dev/null +++ b/src/runtime/NEON/INESimpleFunction.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +INESimpleFunction::INESimpleFunction() + : _kernel(), _border_handler() +{ +} + +void INESimpleFunction::run() +{ + _border_handler.run(_border_handler.window()); + NEScheduler::get().schedule(_kernel.get(), Window::DimY); +} diff --git a/src/runtime/NEON/functions/NEAbsoluteDifference.cpp b/src/runtime/NEON/functions/NEAbsoluteDifference.cpp new file mode 100644 index 0000000000..b39feb3a2b --- /dev/null +++ b/src/runtime/NEON/functions/NEAbsoluteDifference.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h" + +#include + +using namespace arm_compute; + +void NEAbsoluteDifference::configure(const ITensor *input1, const ITensor *input2, ITensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input1, input2, output); + _kernel = std::move(k); +} diff --git a/src/runtime/NEON/functions/NEAccumulate.cpp b/src/runtime/NEON/functions/NEAccumulate.cpp new file mode 100644 index 0000000000..c39abfc540 --- /dev/null +++ b/src/runtime/NEON/functions/NEAccumulate.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
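A caller-side sketch for one of these wrappers, assuming two pre-filled U8 images of the same size (names and dimensions are illustrative):

    #include "arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    Tensor in1, in2, out;
    in1.allocator()->init(TensorInfo(640U, 480U, Format::U8));
    in2.allocator()->init(TensorInfo(640U, 480U, Format::U8));
    out.allocator()->init(TensorInfo(640U, 480U, Format::U8));

    NEAbsoluteDifference absdiff;
    absdiff.configure(&in1, &in2, &out); // configure once
    in1.allocator()->allocate();
    in2.allocator()->allocate();
    out.allocator()->allocate();
    // ... fill in1 and in2 ...
    absdiff.run();                       // run() inherited from INESimpleFunction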
+ */ +#include "arm_compute/runtime/NEON/functions/NEAccumulate.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEAccumulateKernel.h" + +#include + +using namespace arm_compute; + +void NEAccumulate::configure(const ITensor *input, ITensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); +} + +void NEAccumulateWeighted::configure(const ITensor *input, float alpha, ITensor *output, bool use_fp16) +{ + if(use_fp16) + { + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, alpha, output); + _kernel = std::move(k); + } + else + { + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, alpha, output); + _kernel = std::move(k); + } +} + +void NEAccumulateSquared::configure(const ITensor *input, uint32_t shift, ITensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, shift, output); + _kernel = std::move(k); +} diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp new file mode 100644 index 0000000000..f5d81d7cd8 --- /dev/null +++ b/src/runtime/NEON/functions/NEActivationLayer.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h" + +using namespace arm_compute; + +void NEActivationLayer::configure(const ITensor *input, ITensor *output, ActivationLayerInfo activation_info) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, activation_info); + _kernel = std::move(k); +} diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp new file mode 100644 index 0000000000..50cc38b489 --- /dev/null +++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2017 ARM Limited. 
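A short usage sketch for the activation wrapper, assuming src and dst are already initialised F32 tensors of the same shape:

    NEActivationLayer act;
    act.configure(&src, &dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
    act.run();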
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h" + +#include + +using namespace arm_compute; + +void NEArithmeticAddition::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input1, input2, output, policy); + _kernel = std::move(k); +} diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp new file mode 100644 index 0000000000..a3d27c0ed6 --- /dev/null +++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h" + +#include + +using namespace arm_compute; + +void NEArithmeticSubtraction::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input1, input2, output, policy); + _kernel = std::move(k); +} diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp new file mode 100644 index 0000000000..a24429c6de --- /dev/null +++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +NEBatchNormalizationLayer::NEBatchNormalizationLayer() + : _norm_kernel() +{ +} + +void NEBatchNormalizationLayer::configure(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon) +{ + // Configure kernel + _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon); +} + +void NEBatchNormalizationLayer::run() +{ + NEScheduler::get().schedule(&_norm_kernel, Window::DimY); +} diff --git a/src/runtime/NEON/functions/NEBitwiseAnd.cpp b/src/runtime/NEON/functions/NEBitwiseAnd.cpp new file mode 100644 index 0000000000..5aafc51dc0 --- /dev/null +++ b/src/runtime/NEON/functions/NEBitwiseAnd.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h" + +#include + +using namespace arm_compute; + +void NEBitwiseAnd::configure(const ITensor *input1, const ITensor *input2, ITensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input1, input2, output); + _kernel = std::move(k); +} diff --git a/src/runtime/NEON/functions/NEBitwiseNot.cpp b/src/runtime/NEON/functions/NEBitwiseNot.cpp new file mode 100644 index 0000000000..af3df6e46a --- /dev/null +++ b/src/runtime/NEON/functions/NEBitwiseNot.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEBitwiseNot.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h" + +#include + +using namespace arm_compute; + +void NEBitwiseNot::configure(const ITensor *input, ITensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); +} diff --git a/src/runtime/NEON/functions/NEBitwiseOr.cpp b/src/runtime/NEON/functions/NEBitwiseOr.cpp new file mode 100644 index 0000000000..d12c5e5f6f --- /dev/null +++ b/src/runtime/NEON/functions/NEBitwiseOr.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEBitwiseOr.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h" + +#include + +using namespace arm_compute; + +void NEBitwiseOr::configure(const ITensor *input1, const ITensor *input2, ITensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input1, input2, output); + _kernel = std::move(k); +} diff --git a/src/runtime/NEON/functions/NEBitwiseXor.cpp b/src/runtime/NEON/functions/NEBitwiseXor.cpp new file mode 100644 index 0000000000..65c943e64c --- /dev/null +++ b/src/runtime/NEON/functions/NEBitwiseXor.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEBitwiseXor.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h" + +#include + +using namespace arm_compute; + +void NEBitwiseXor::configure(const ITensor *input1, const ITensor *input2, ITensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input1, input2, output); + _kernel = std::move(k); +} diff --git a/src/runtime/NEON/functions/NEBox3x3.cpp b/src/runtime/NEON/functions/NEBox3x3.cpp new file mode 100644 index 0000000000..7f0b45d34c --- /dev/null +++ b/src/runtime/NEON/functions/NEBox3x3.cpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEBox3x3.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEBox3x3Kernel.h" +#include "arm_compute/core/PixelValue.h" + +#include + +using namespace arm_compute; + +void NEBox3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value, bool use_fp16) +{ + if(use_fp16) + { + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, border_mode == BorderMode::UNDEFINED); + _kernel = std::move(k); + } + else + { + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, border_mode == BorderMode::UNDEFINED); + _kernel = std::move(k); + } + _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); +} diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp new file mode 100644 index 0000000000..26f31f557b --- /dev/null +++ b/src/runtime/NEON/functions/NECannyEdge.cpp @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2017 ARM Limited. 
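use_fp16 merely selects the F16 kernel variant when the library is built with FP16 support; the interface is otherwise identical. A sketch with replicated borders (tensor names illustrative):

    NEBox3x3 box;
    box.configure(&src, &dst, BorderMode::REPLICATE, 0 /* constant border value, unused here */, false /* use_fp16 */);
    box.run();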
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NECannyEdge.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h" +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/NEON/functions/NESobel3x3.h" +#include "arm_compute/runtime/NEON/functions/NESobel5x5.h" +#include "arm_compute/runtime/NEON/functions/NESobel7x7.h" +#include "arm_compute/runtime/TensorAllocator.h" + +#include +#include + +using namespace arm_compute; + +NECannyEdge::NECannyEdge() + : _sobel(), _gradient(), _non_max_suppr(), _edge_trace(), _border_mag_gradient(), _border_edge_trace(), _gx(), _gy(), _magnitude(), _phase(), _nonmax(), _output(nullptr) +{ +} + +void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value, + bool use_fp16) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(gradient_size < 3); + ARM_COMPUTE_ERROR_ON(gradient_size > 7); + ARM_COMPUTE_ERROR_ON(lower_thr > upper_thr); + ARM_COMPUTE_ERROR_ON((1 != norm_type) && (2 != norm_type)); + + _output = output; + + const TensorShape &shape = input->info()->tensor_shape(); + TensorInfo gradient_info; + TensorInfo magnitude_info; + + // Initialize images + if(gradient_size < 7) + { + gradient_info.init(shape, Format::S16); + magnitude_info.init(shape, Format::U16); + } + else + { + gradient_info.init(shape, Format::S32); + magnitude_info.init(shape, Format::U32); + } + + _gx.allocator()->init(gradient_info); + _gy.allocator()->init(gradient_info); + _magnitude.allocator()->init(magnitude_info); + + TensorInfo info(shape, Format::U8); + _phase.allocator()->init(info); + _nonmax.allocator()->init(info); + + // Configure/Init sobelNxN + if(gradient_size == 3) + { + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, &_gx, &_gy, border_mode, constant_border_value); + _sobel = std::move(k); + } + else if(gradient_size == 5) + { + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, 
&_gx, &_gy, border_mode, constant_border_value); + _sobel = std::move(k); + } + else if(gradient_size == 7) + { + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, &_gx, &_gy, border_mode, constant_border_value); + _sobel = std::move(k); + } + else + { + ARM_COMPUTE_ERROR("Gradient size not supported\n"); + } + + // Configure gradient + if(use_fp16) + { + auto k = arm_compute::cpp14::make_unique(); + k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type); + _gradient = std::move(k); + } + else + { + auto k = arm_compute::cpp14::make_unique(); + k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type); + _gradient = std::move(k); + } + + // Configure non-maxima suppression + _non_max_suppr.configure(&_magnitude, &_phase, &_nonmax, upper_thr, lower_thr, border_mode == BorderMode::UNDEFINED); + + // Fill border around magnitude image as non-maxima suppression will access + // it. If border mode is undefined filling the border is a nop. + _border_mag_gradient.configure(&_magnitude, _non_max_suppr.border_size(), border_mode, constant_border_value); + + // Configure edge tracing + _edge_trace.configure(&_nonmax, output); + + // Fill border with "No edge" to stop recursion in edge trace + _border_edge_trace.configure(&_nonmax, _edge_trace.border_size(), BorderMode::CONSTANT, 0); + + // Allocate intermediate tensors + _gx.allocator()->allocate(); + _gy.allocator()->allocate(); + _phase.allocator()->allocate(); + _magnitude.allocator()->allocate(); + _nonmax.allocator()->allocate(); +} + +void NECannyEdge::run() +{ + ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function"); + ARM_COMPUTE_ERROR_ON(_output == nullptr); + + // Run sobelNxN + _sobel->run(); + + // Fill border before non-maxima suppression. Nop for border mode undefined. + _border_mag_gradient.run(_border_mag_gradient.window()); + + // Run gradient + NEScheduler::get().schedule(_gradient.get(), Window::DimY); + + // Run non-maxima suppression + NEScheduler::get().schedule(&_non_max_suppr, Window::DimY); + + ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr); + memset(_output->buffer(), 0, _output->info()->total_size()); + + // Fill border before edge trace + _border_edge_trace.run(_border_edge_trace.window()); + + // Run edge tracing + _edge_trace.run(_edge_trace.window()); +} diff --git a/src/runtime/NEON/functions/NEChannelCombine.cpp b/src/runtime/NEON/functions/NEChannelCombine.cpp new file mode 100644 index 0000000000..84d4fff4ff --- /dev/null +++ b/src/runtime/NEON/functions/NEChannelCombine.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
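The configure() above wires a four-stage pipeline: SobelNxN, gradient magnitude/phase, non-maxima suppression, and hysteresis edge tracing, with border fills around the magnitude and non-maxima images. A caller-side sketch, with thresholds and gradient size chosen purely for illustration:

    NECannyEdge canny;
    canny.configure(&src, &edges,
                    120 /* upper threshold */, 60 /* lower threshold */,
                    3 /* Sobel size */, 1 /* L1 norm */,
                    BorderMode::REPLICATE, 0, false /* use_fp16 */);
    canny.run(); // clears the output, then runs the staged kernels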
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEChannelCombine.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEChannelCombineKernel.h" + +#include + +using namespace arm_compute; + +void NEChannelCombine::configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(plane0, plane1, plane2, plane3, output); + _kernel = std::move(k); +} + +void NEChannelCombine::configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(plane0, plane1, plane2, output); + _kernel = std::move(k); +} diff --git a/src/runtime/NEON/functions/NEChannelExtract.cpp b/src/runtime/NEON/functions/NEChannelExtract.cpp new file mode 100644 index 0000000000..634e918eac --- /dev/null +++ b/src/runtime/NEON/functions/NEChannelExtract.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEChannelExtract.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEChannelExtractKernel.h" + +#include + +using namespace arm_compute; + +void NEChannelExtract::configure(const ITensor *input, Channel channel, ITensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, channel, output); + _kernel = std::move(k); +} + +void NEChannelExtract::configure(const IMultiImage *input, Channel channel, IImage *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, channel, output); + _kernel = std::move(k); +} diff --git a/src/runtime/NEON/functions/NEColorConvert.cpp b/src/runtime/NEON/functions/NEColorConvert.cpp new file mode 100644 index 0000000000..bbaa832284 --- /dev/null +++ b/src/runtime/NEON/functions/NEColorConvert.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEColorConvert.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h" + +#include + +using namespace arm_compute; + +void NEColorConvert::configure(const ITensor *input, ITensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); +} + +void NEColorConvert::configure(const IMultiImage *input, IImage *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); +} + +void NEColorConvert::configure(const IImage *input, IMultiImage *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); +} + +void NEColorConvert::configure(const IMultiImage *input, IMultiImage *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); +} diff --git a/src/runtime/NEON/functions/NEConvolution.cpp b/src/runtime/NEON/functions/NEConvolution.cpp new file mode 100644 index 0000000000..3f39ae2cbd --- /dev/null +++ b/src/runtime/NEON/functions/NEConvolution.cpp @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
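The four overloads cover tensor-to-tensor, multi-planar-to-planar and planar-to-multi-planar conversions. A sketch converting an NV12 MultiImage to RGB888, assuming the conversion is one of those supported by NEColorConvertKernel (dimensions illustrative):

    MultiImage nv12;
    nv12.init(640, 480, Format::NV12);
    Tensor rgb;
    rgb.allocator()->init(TensorInfo(640U, 480U, Format::RGB888));

    NEColorConvert convert;
    convert.configure(&nv12, &rgb); // IMultiImage -> IImage overload
    nv12.allocate();
    rgb.allocator()->allocate();
    convert.run();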
+ */ +#include "arm_compute/runtime/NEON/functions/NEConvolution.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/TensorAllocator.h" + +#include +#include + +using namespace arm_compute; + +void NEConvolution3x3::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED); + _kernel = std::move(k); + _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); +} + +template +NEConvolutionSquare::NEConvolutionSquare() + : _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler() +{ +} + +template +void NEConvolutionSquare::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON(conv == nullptr); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); + + std::array conv_col{ { 0 } }; + std::array conv_row{ { 0 } }; + + _is_separable = separate_matrix(conv, conv_col.data(), conv_row.data(), matrix_size); + + if(_is_separable) + { + DataType intermediate_type = DataType::UNKNOWN; + std::tie(std::ignore, intermediate_type) = data_type_for_convolution(conv_col.data(), conv_row.data(), matrix_size); + + _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, intermediate_type)); + + if(scale == 0) + { + scale = calculate_matrix_scale(conv, matrix_size); + } + + _kernel_hor.configure(input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED); + _kernel_vert.configure(&_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED); + + _tmp.allocator()->allocate(); + + _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value)); + } + else + { + _kernel.configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED); + _border_handler.configure(input, _kernel.border_size(), border_mode, PixelValue(constant_border_value)); + } +} + +template +void NEConvolutionSquare::run() +{ + _border_handler.run(_border_handler.window()); + + if(_is_separable) + { + NEScheduler::get().schedule(&_kernel_hor, Window::DimY); + NEScheduler::get().schedule(&_kernel_vert, Window::DimY); + } + else + { + NEScheduler::get().schedule(&_kernel, Window::DimY); + } +} + +template class arm_compute::NEConvolutionSquare<5>; +template class arm_compute::NEConvolutionSquare<7>; +template class arm_compute::NEConvolutionSquare<9>; + +void NEConvolutionRectangle::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED); + _kernel = std::move(k); + _border_handler.configure(input, _kernel->border_size(), border_mode, 
PixelValue(constant_border_value)); +} diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp new file mode 100644 index 0000000000..bd688cffb6 --- /dev/null +++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h" + +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include +#include + +using namespace arm_compute; + +NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights() + : _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false) +{ +} + +void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose1xW) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, output); + ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4); + + if(biases != nullptr) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases); + ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3)); + ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1); + } + + // Check if bias are present, if yes they will be embedded to the weights matrix + const bool _has_bias = (biases != nullptr); + + _transpose1xW = transpose1xW; + + if(transpose1xW) + { + // Create tensor to store the reshaped weights + const unsigned int mat_weights_cols = weights->info()->dimension(3); + const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 
1 : 0); + TensorShape shape_wr(mat_weights_cols, mat_weights_rows); + TensorInfo info_wr(shape_wr, 1, weights->info()->data_type(), weights->info()->fixed_point_position()); + + _weights_reshaped.allocator()->init(info_wr); + _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped); + _weights_transposed_kernel.configure(&_weights_reshaped, output); + _weights_reshaped.allocator()->allocate(); + } + else + { + _weights_reshape_kernel.configure(weights, biases, output); + } +} + +void NEConvolutionLayerReshapeWeights::run() +{ + NEScheduler::get().schedule(&_weights_reshape_kernel, 3); + if(_transpose1xW) + { + NEScheduler::get().schedule(&_weights_transposed_kernel, Window::DimY); + } +} + +NEConvolutionLayer::NEConvolutionLayer() + : _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), + _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false) +{ +} + +void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights, output); + ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2)); + ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4); + + if(biases != nullptr) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases); + ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3)); + ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1); + } + + const DataType dt = input->info()->data_type(); + const int fixed_point_position = input->info()->fixed_point_position(); + + _has_bias = (biases != nullptr); + _are_weights_reshaped = weights_info.are_reshaped(); + + // Get parameters from conv_info + unsigned int stride_x = 0; + unsigned int stride_y = 0; + unsigned int pad_x = 0; + unsigned int pad_y = 0; + std::tie(stride_x, stride_y) = conv_info.stride(); + std::tie(pad_x, pad_y) = conv_info.pad(); + + // Get convolved dimensions + unsigned int conv_w = 0; + unsigned int conv_h = 0; + + const unsigned int kernel_width = (_are_weights_reshaped) ? 
weights_info.kernel_size() : weights->info()->dimension(0); + std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, + stride_x, stride_y, pad_x, pad_y, conv_info.round()); + ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one"); + + // Check if its a "fully connected" convolution + _is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1)); + + unsigned int mat_weights_cols = weights->info()->dimension(3); + unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0); + + // Reshape weights if needed + if(_are_weights_reshaped) + { + mat_weights_cols = output->info()->dimension(2); + const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4; + mat_weights_rows = (_has_bias ? 1 + quarter_reshaped_cols : quarter_reshaped_cols); + } + else + { + if(_is_fully_connected_convolution) + { + // Create tensor to store the reshaped weights + TensorShape shape_wr(mat_weights_cols, mat_weights_rows); + TensorInfo info_wr(shape_wr, 1, dt, fixed_point_position); + _weights_reshaped.allocator()->init(info_wr); + _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */); + } + else + { + // Create tensor to store transposed weights + const float transpose_width = 16.0f / input->info()->element_size(); + TensorShape shape_wt(mat_weights_rows * static_cast(transpose_width), static_cast(std::ceil(mat_weights_cols / transpose_width))); + TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position); + _weights_reshaped.allocator()->init(info_wt); + _reshape_weights.configure(weights, biases, &_weights_reshaped, true /* 1xW transpose */); + } + weights = &_weights_reshaped; + } + + // Create tensor to store im2col reshaped inputs + const unsigned int mat_input_cols = mat_weights_rows; + const unsigned int mat_input_rows = conv_w * conv_h; + TensorShape shape_im2col = input->info()->tensor_shape(); + shape_im2col.set(0, mat_input_cols); + shape_im2col.set(1, mat_input_rows); + shape_im2col.set(2, 1); + _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position)); + + // Create tensor (interleave) to prepare input tensor for GEMM + if(!_is_fully_connected_convolution) + { + TensorShape shape_interleaved = shape_im2col; + shape_interleaved.set(0, shape_interleaved.x() * 4); + shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f)); + _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position)); + } + + // Create GEMM output tensor + TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape(); + shape_gemm.set(0, mat_weights_cols); + shape_gemm.set(1, mat_input_rows); + _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, dt, fixed_point_position)); + + // Configure kernels + _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias); + if(_is_fully_connected_convolution) + { + _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f); + } + else + { + _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped); + _mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f); + } + _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h)); + + // Allocate 
intermediate tensor + if(!_are_weights_reshaped) + { + _weights_reshaped.allocator()->allocate(); + } + _input_im2col_reshaped.allocator()->allocate(); + if(!_is_fully_connected_convolution) + { + _input_interleaved_reshaped.allocator()->allocate(); + } + _gemm_output.allocator()->allocate(); +} + +void NEConvolutionLayer::run() +{ + // Run weights reshaping (Runs once for every configure) + if(!_are_weights_reshaped) + { + _are_weights_reshaped = true; + _reshape_weights.run(); + } + + // Run input reshaping + NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY); + if(!_is_fully_connected_convolution) + { + // Run interleave + NEScheduler::get().schedule(&_input_interleave_kernel, Window::DimY); + } + + // Runs matrix multiply on reshaped matrices + NEScheduler::get().schedule(&_mm_kernel, Window::DimY); + + // Reshape output matrix + NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY); +} diff --git a/src/runtime/NEON/functions/NEDepthConcatenate.cpp b/src/runtime/NEON/functions/NEDepthConcatenate.cpp new file mode 100644 index 0000000000..7d2c5494a9 --- /dev/null +++ b/src/runtime/NEON/functions/NEDepthConcatenate.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
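configure() lowers the convolution to a GEMM: the weights are reshaped (and 1xW-transposed unless the convolution degenerates to fully connected), the input is im2col'd and interleaved, multiplied, then col2im'd back into the output. A caller-side sketch for a 3x3, stride 1, pad 1, F32 layer (all shapes illustrative):

    Tensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 8U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));

    NEConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1));
    // allocate the four tensors, fill src/weights/biases, then:
    conv.run(); // the first run also performs the one-off weights reshape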
+ */ +#include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h" +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +NEDepthConcatenate::NEDepthConcatenate() + : _inputs_vector(), _concat_kernels_vector(), _border_handlers_vector(), _num_inputs(0) +{ +} + +void NEDepthConcatenate::configure(std::vector inputs_vector, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2); + + _num_inputs = inputs_vector.size(); + _concat_kernels_vector = arm_compute::cpp14::make_unique(_num_inputs); + _border_handlers_vector = arm_compute::cpp14::make_unique(_num_inputs); + + unsigned int depth_offset = 0; + for(unsigned int i = 0; i < _num_inputs; ++i) + { + _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output); + _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0)); + + depth_offset += inputs_vector.at(i)->info()->dimension(2); + } +} + +void NEDepthConcatenate::run() +{ + for(unsigned i = 0; i < _num_inputs; ++i) + { + NEScheduler::get().schedule(&_border_handlers_vector[i], Window::DimX); + NEScheduler::get().schedule(&_concat_kernels_vector[i], Window::DimX); + } +} diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/NEON/functions/NEDepthConvert.cpp new file mode 100644 index 0000000000..a339cae316 --- /dev/null +++ b/src/runtime/NEON/functions/NEDepthConvert.cpp @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
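A sketch concatenating two F32 tensors along the depth axis; the output must be deep enough to hold every input (shapes illustrative):

    Tensor a, b, out;
    a.allocator()->init(TensorInfo(TensorShape(64U, 64U, 8U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(64U, 64U, 24U), 1, DataType::F32));
    out.allocator()->init(TensorInfo(TensorShape(64U, 64U, 32U), 1, DataType::F32));

    NEDepthConcatenate concat;
    concat.configure({ &a, &b }, &out); // vector of at least two inputs
    // allocate and fill a and b, then:
    concat.run();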
+ */ +#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h" + +#include + +using namespace arm_compute; + +void NEDepthConvert::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32); + ARM_COMPUTE_ERROR_ON(input == output); + ARM_COMPUTE_ERROR_ON(input->info()->data_type() == output->info()->data_type()); + + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, policy, shift); + _kernel = std::move(k); +} diff --git a/src/runtime/NEON/functions/NEDerivative.cpp b/src/runtime/NEON/functions/NEDerivative.cpp new file mode 100644 index 0000000000..2887c13233 --- /dev/null +++ b/src/runtime/NEON/functions/NEDerivative.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEDerivative.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +NEDerivative::NEDerivative() + : _kernel(), _border_handler() +{ +} + +void NEDerivative::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); + + _kernel.configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED); + _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value)); +} + +void NEDerivative::run() +{ + _border_handler.run(_border_handler.window()); + NEScheduler::get().schedule(&_kernel, Window::DimY); +} diff --git a/src/runtime/NEON/functions/NEDilate.cpp b/src/runtime/NEON/functions/NEDilate.cpp new file mode 100644 index 0000000000..0c016f14f9 --- /dev/null +++ b/src/runtime/NEON/functions/NEDilate.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEDilate.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEDilateKernel.h" +#include "arm_compute/core/PixelValue.h" + +#include + +using namespace arm_compute; + +void NEDilate::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, border_mode == BorderMode::UNDEFINED); + _kernel = std::move(k); + _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); +} diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp new file mode 100644 index 0000000000..3f3e7710fb --- /dev/null +++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" + +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include +#include + +using namespace arm_compute; + +NEDirectConvolutionLayer::NEDirectConvolutionLayer() + : _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator() +{ +} + +void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32); + + // Free accumulator + if(_accumulator.buffer() != nullptr) + { + _accumulator.allocator()->free(); + } + + // Allocate the intermediate accumulator tensor in case of fixed point input + if(output->info()->data_type() == DataType::QS8) + { + _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS16, output->info()->fixed_point_position())); + _conv_kernel.configure(input, weights, &_accumulator, conv_info); + _accumulate_bias_kernel.configure(&_accumulator, bias, output); + _accumulator.allocator()->allocate(); + } + else + { + _conv_kernel.configure(input, weights, output, conv_info); + _accumulate_bias_kernel.configure(output, bias); + } + + // Add zero padding XY + _input_border_handler.configure(input, _conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0)); +} + +void NEDirectConvolutionLayer::run() +{ + _input_border_handler.run(_input_border_handler.window()); + + NEScheduler::get().schedule(&_conv_kernel, Window::DimZ); + NEScheduler::get().schedule(&_accumulate_bias_kernel, Window::DimY); +} diff --git a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp new file mode 100644 index 0000000000..f6ec677e44 --- /dev/null +++ b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
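// Usage sketch (not taken from the patch): driving NEDirectConvolutionLayer with an
// assumed 32x32x3 F32 input, a 1x1x3x8 weight set, stride 1 and no padding. All
// names and shapes below are illustrative assumptions.
#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void example_direct_convolution()
{
    Tensor src{}, weights{}, bias{}, dst{};
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 3U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(1U, 1U, 3U, 8U), 1, DataType::F32));
    bias.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));

    NEDirectConvolutionLayer conv{};
    conv.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 0, 0));

    // Backing memory is allocated after configure(), then the tensors are filled.
    src.allocator()->allocate();
    weights.allocator()->allocate();
    bias.allocator()->allocate();
    dst.allocator()->allocate();

    conv.run();
}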
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +NEEqualizeHistogram::NEEqualizeHistogram() + : _histogram_kernel(), _cd_histogram_kernel(), _map_histogram_kernel(), _hist(nr_bins, 0, max_range), _cum_dist(nr_bins, 0, max_range), _cd_lut(nr_bins, DataType::U8) +{ +} + +void NEEqualizeHistogram::configure(const IImage *input, IImage *output) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + + // Configure kernels + _histogram_kernel.configure(input, &_hist); + _cd_histogram_kernel.configure(input, &_hist, &_cum_dist, &_cd_lut); + _map_histogram_kernel.configure(input, &_cd_lut, output); +} + +void NEEqualizeHistogram::run() +{ + // Calculate histogram of input. + NEScheduler::get().schedule(&_histogram_kernel, Window::DimY); + + // Calculate cumulative distribution of histogram and create LUT. + _cd_histogram_kernel.run(_cd_histogram_kernel.window()); + + // Map input to output using created LUT. + NEScheduler::get().schedule(&_map_histogram_kernel, Window::DimY); +} diff --git a/src/runtime/NEON/functions/NEErode.cpp b/src/runtime/NEON/functions/NEErode.cpp new file mode 100644 index 0000000000..9b011db845 --- /dev/null +++ b/src/runtime/NEON/functions/NEErode.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
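// Usage sketch (assumed 640x480 U8 images): NEEqualizeHistogram only needs the input
// and output images; the histogram, cumulative distribution and LUT seen above are
// managed internally by the function.
#include "arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void example_equalize_histogram()
{
    Tensor src{}, dst{};
    src.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));
    dst.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));

    NEEqualizeHistogram equalize{};
    equalize.configure(&src, &dst);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src with image data ...
    equalize.run();
}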
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEErode.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEErodeKernel.h" +#include "arm_compute/core/PixelValue.h" + +#include + +using namespace arm_compute; + +void NEErode::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, border_mode == BorderMode::UNDEFINED); + _kernel = std::move(k); + _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); +} diff --git a/src/runtime/NEON/functions/NEFastCorners.cpp b/src/runtime/NEON/functions/NEFastCorners.cpp new file mode 100644 index 0000000000..33a58f1904 --- /dev/null +++ b/src/runtime/NEON/functions/NEFastCorners.cpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
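// Usage sketch for the simple one-kernel wrappers above (NEDilate and NEErode follow
// the same pattern). Image size and border mode are assumptions for the example.
#include "arm_compute/runtime/NEON/functions/NEErode.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void example_erode()
{
    Tensor src{}, dst{};
    src.allocator()->init(TensorInfo(TensorShape(320U, 240U), Format::U8));
    dst.allocator()->init(TensorInfo(TensorShape(320U, 240U), Format::U8));

    NEErode erode{};
    erode.configure(&src, &dst, BorderMode::REPLICATE, 0);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    erode.run(); // fills the input border, then schedules the erode kernel
}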
+ */ +#include "arm_compute/runtime/NEON/functions/NEFastCorners.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/Array.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/TensorAllocator.h" + +using namespace arm_compute; + +NEFastCorners::NEFastCorners() + : _fast_corners_kernel(), + _border_handler(), + _nonmax_kernel(), + _fill_kernel(), + _output(), + _suppressed(), + _non_max(false) +{ +} + +void NEFastCorners::configure(IImage *input, float threshold, bool nonmax_suppression, KeyPointArray *corners, + BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON(BorderMode::UNDEFINED != border_mode); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(nullptr == corners); + ARM_COMPUTE_ERROR_ON(threshold < 1 && threshold > 255); + + _non_max = nonmax_suppression; + + TensorInfo tensor_info(input->info()->tensor_shape(), Format::U8); + _output.allocator()->init(tensor_info); + + // If border is UNDEFINED _fast_corners_kernel will operate in xwindow (3, + // width - 3) and ywindow (3, height -3) so the output image will leave the + // pixels on the borders unchanged. This is reflected in the valid region + // of the output. The non maxima suppression is only run on the valid + // pixels. + _fast_corners_kernel.configure(input, &_output, threshold, nonmax_suppression, BorderMode::UNDEFINED == border_mode); + _border_handler.configure(input, _fast_corners_kernel.border_size(), border_mode, constant_border_value); + + if(!_non_max) + { + _fill_kernel.configure(&_output, 1 /* we keep all texels >0 */, corners); + } + else + { + _suppressed.allocator()->init(tensor_info); + _nonmax_kernel.configure(&_output, &_suppressed, BorderMode::UNDEFINED == border_mode); + _fill_kernel.configure(&_suppressed, 1 /* we keep all texels >0 */, corners); + + // Allocate intermediate tensors + _suppressed.allocator()->allocate(); + } + + // Allocate intermediate tensors + _output.allocator()->allocate(); +} + +void NEFastCorners::run() +{ + _border_handler.run(_border_handler.window()); + + NEScheduler::get().schedule(&_fast_corners_kernel, Window::DimY); + + if(_non_max) + { + NEScheduler::get().schedule(&_nonmax_kernel, Window::DimY); + } + + NEScheduler::get().schedule(&_fill_kernel, Window::DimY); +} diff --git a/src/runtime/NEON/functions/NEFillBorder.cpp b/src/runtime/NEON/functions/NEFillBorder.cpp new file mode 100644 index 0000000000..e884f4a668 --- /dev/null +++ b/src/runtime/NEON/functions/NEFillBorder.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEFillBorder.h" + +#include "arm_compute/core/Window.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +void NEFillBorder::configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value) +{ + _border_handler.configure(input, border_width, border_mode, constant_border_value); +} + +void NEFillBorder::run() +{ + NEScheduler::get().schedule(&_border_handler, Window::DimZ); +} diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp new file mode 100644 index 0000000000..abb41e9f70 --- /dev/null +++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp @@ -0,0 +1,344 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" + +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include +#include + +using namespace arm_compute; + +NEFullyConnectedLayerReshapeWeights::NEFullyConnectedLayerReshapeWeights() + : _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false) +{ +} + +void NEFullyConnectedLayerReshapeWeights::configure(const ITensor *input, ITensor *output, bool transpose_weights, bool is_batched_fc_layer) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON(output == nullptr); + ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != 2); + ARM_COMPUTE_ERROR_ON((transpose_weights == false) && (is_batched_fc_layer == false)); + + const DataType dt = input->info()->data_type(); + const int fixed_point_position = input->info()->fixed_point_position(); + + _transpose_weights = transpose_weights; + _is_batched_fc_layer = is_batched_fc_layer; + + // Check if we need to transpose the weights + if(_transpose_weights) + { + if(_is_batched_fc_layer) + { + // Initialize the output tensor for transpose + TensorShape shape_transposed(input->info()->dimension(1), input->info()->dimension(0)); + _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, dt, fixed_point_position)); + _transpose_kernel.configure(input, &_transpose_output); + + // Configure transpose 1xW kernel + _transpose1xW_kernel.configure(&_transpose_output, output); + + // Allocate temporary tensor used for transposing the weights + _transpose_output.allocator()->allocate(); + } + else + { + _transpose_kernel.configure(input, output); + } + } + else + { + if(_is_batched_fc_layer) + { + // Configure transpose 1xW kernel + _transpose1xW_kernel.configure(input, output); + } + else + { + ARM_COMPUTE_ERROR("Configuration transpose_weights=false & is_batched_fc_layer=false not supported"); + } + } +} + +void NEFullyConnectedLayerReshapeWeights::run() +{ + if(_transpose_weights) + { + NEScheduler::get().schedule(&_transpose_kernel, Window::DimY); + } + if(_is_batched_fc_layer) + { + NEScheduler::get().schedule(&_transpose1xW_kernel, Window::DimY); + } +} + +NEFullyConnectedLayer::NEFullyConnectedLayer() + : _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _reshape_weights_output(), + _are_weights_reshaped(false), _is_fc_after_conv(false), _is_batched_fc_layer(false), _accumulate_biases(false) +{ +} + +void NEFullyConnectedLayer::configure_conv_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2) * (16 / weights->info()->element_size()))); + + const DataType dt = input->info()->data_type(); + const int fixed_point_position = input->info()->fixed_point_position(); + + // If the fully connected layer is called after a convolution layer, the input tensor must be linearized + + // Initialize output tensor for im2col + TensorShape shape_im2col; + shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)); + shape_im2col.set(1, input->info()->dimension(3)); + shape_im2col.set(2, input->info()->dimension(4)); + shape_im2col.set(3, input->info()->dimension(5)); + 
_im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position)); + + // Initialize output tensor for interleave 4x4 + TensorShape shape_interleaved = _im2col_output.info()->tensor_shape(); + shape_interleaved.set(0, shape_interleaved.x() * 4); + shape_interleaved.set(1, std::ceil(static_cast(shape_interleaved.y()) / 4)); + _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position)); + + // Configure im2col kernel + _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false); + + // Configure interleave4x4 kernel + _interleave4x4_kernel.configure(&_im2col_output, &_interleave4x4_output); + + // Configure matrix multiply kernel + _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f); + + // Allocate the tensors once all the configure methods have been called + _im2col_output.allocator()->allocate(); + _interleave4x4_output.allocator()->allocate(); +} + +void NEFullyConnectedLayer::configure_fc_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output) +{ + const DataType dt = input->info()->data_type(); + const int fixed_point_position = input->info()->fixed_point_position(); + + // Initialize output tensor for interleave 4x4 + TensorShape shape_interleaved = input->info()->tensor_shape(); + shape_interleaved.set(0, shape_interleaved.x() * 4); + shape_interleaved.set(1, std::ceil(static_cast(shape_interleaved.y()) / 4)); + _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position)); + + // Configure interleave4x4 kernel + _interleave4x4_kernel.configure(input, &_interleave4x4_output); + + // Configure matrix multiply kernel + _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f); + + // Allocate the tensors once all the configure methods have been called + _interleave4x4_output.allocator()->allocate(); +} + +void NEFullyConnectedLayer::configure_conv_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); + + const DataType dt = input->info()->data_type(); + const int fixed_point_position = input->info()->fixed_point_position(); + + // If the fully connected layer is called after a convolution layer, the input tensor must be linearized + + // Initialize output tensor for im2col + TensorShape shape_im2col; + shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)); + shape_im2col.set(1, 1); + _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position)); + + // Configure im2col kernel + _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false); + + // Configure matrix multiply kernel + _mm_kernel.configure(&_im2col_output, weights, output, 1.0f); + + // Allocate the output tensor for im2col once all the configure methods have been called + _im2col_output.allocator()->allocate(); +} + +void NEFullyConnectedLayer::configure_fc_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); + + // Configure matrix multiply kernel + _mm_kernel.configure(input, weights, output, 1.0f); +} + +void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose_weights, 
bool are_weights_reshaped) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output); + ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() != 2); + + const DataType dt = input->info()->data_type(); + const int fixed_point_position = input->info()->fixed_point_position(); + + _are_weights_reshaped = are_weights_reshaped; + _is_fc_after_conv = true; + _is_batched_fc_layer = false; + _accumulate_biases = false; + + if(biases != nullptr) + { + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); + + _accumulate_biases = true; + + // Configure accumulate biases kernel + _accumulate_biases_kernel.configure(output, biases); + } + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + // Check if we have a fully connected layer with batches + _is_batched_fc_layer = (output->info()->dimension(1) > 1); + + const ITensor *weights_to_use = weights; + + if(!are_weights_reshaped) + { + if((transpose_weights || _is_batched_fc_layer)) + { + weights_to_use = &_reshape_weights_output; + + if(transpose_weights) + { + if(_is_batched_fc_layer) + { + const float transpose_width = 16.0f / input->info()->element_size(); + TensorShape shape_wt(weights->info()->dimension(0) * static_cast(transpose_width), static_cast(std::ceil(weights->info()->dimension(1) / transpose_width))); + TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position); + _reshape_weights_output.allocator()->init(info_wt); + } + else + { + TensorShape shape_wt(weights->info()->dimension(1), weights->info()->dimension(0)); + TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position); + _reshape_weights_output.allocator()->init(info_wt); + } + } + else + { + ARM_COMPUTE_ERROR_ON(!_is_batched_fc_layer); + + const float transpose_width = 16.0f / input->info()->element_size(); + TensorShape shape_wt(weights->info()->dimension(1) * static_cast(transpose_width), static_cast(std::ceil(weights->info()->dimension(0) / transpose_width))); + TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position); + _reshape_weights_output.allocator()->init(info_wt); + } + + // Reshape the weights + _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer); + } + } + + if(_is_batched_fc_layer) + { + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3, + input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); + + if(_is_fc_after_conv) + { + // Fully Connected layer after a Convolution Layer with batches + configure_conv_fc_wb(input, weights_to_use, output); + } + else + { + // Fully Connected layer after a Fully Connected Layer with batches + configure_fc_fc_wb(input, weights_to_use, output); + } + } + else + { + // In case of not batched fully connected layer, the weights will not be reshaped using transposed1xW + _is_fc_after_conv = ((weights_to_use->info()->dimension(1)) == (input->info()->dimension(0) * input->info()->dimension(1) * 
input->info()->dimension(2))); + + if(_is_fc_after_conv) + { + // Fully Connected layer after a Convolution Layer without batches + configure_conv_fc_nb(input, weights_to_use, output); + } + else + { + // Fully Connected layer after a Fully Connected Layer without batches + configure_fc_fc_nb(input, weights_to_use, output); + } + } + + // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called + if(!are_weights_reshaped) + { + if(transpose_weights || _is_batched_fc_layer) + { + // Allocate the tensor for the weights reshaped + _reshape_weights_output.allocator()->allocate(); + } + } +} + +void NEFullyConnectedLayer::run() +{ + // Reshape of the weights (happens only once) + if(!_are_weights_reshaped) + { + _are_weights_reshaped = true; + _reshape_weights_kernel.run(); + } + + // Linearize input if comes from a convolutional layer + if(_is_fc_after_conv) + { + NEScheduler::get().schedule(&_im2col_kernel, Window::DimY); + } + + // Interleave input + if(_is_batched_fc_layer) + { + NEScheduler::get().schedule(&_interleave4x4_kernel, Window::DimY); + } + + // Run matrix multiply + NEScheduler::get().schedule(&_mm_kernel, _is_batched_fc_layer ? Window::DimY : Window::DimX); + + // Accumulate biases if provided + if(_accumulate_biases) + { + NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY); + } +} diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp new file mode 100644 index 0000000000..15d5f4effb --- /dev/null +++ b/src/runtime/NEON/functions/NEGEMM.cpp @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
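// Usage sketch for a not-batched fully connected layer (assumed sizes: 128 inputs,
// 10 outputs, F32). The weights are passed untransposed and unreshaped, so the
// reshape kernel configured above runs once on the first call to run().
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void example_fully_connected()
{
    Tensor src{}, weights{}, bias{}, dst{};
    src.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(128U, 10U), 1, DataType::F32));
    bias.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));

    NEFullyConnectedLayer fc{};
    fc.configure(&src, &weights, &bias, &dst, true /* transpose_weights */, false /* are_weights_reshaped */);

    src.allocator()->allocate();
    weights.allocator()->allocate();
    bias.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill src, weights and bias ...
    fc.run(); // reshapes the weights once, then runs im2col, matrix multiply and bias accumulation
}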
+ */ +#include "arm_compute/runtime/NEON/functions/NEGEMM.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/TensorAllocator.h" + +#include + +using namespace arm_compute; + +NEGEMM::NEGEMM() + : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _run_vector_matrix_multiplication(false), _run_addition(false) +{ +} + +void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16, DataType::QS8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::F32, DataType::F16, DataType::QS8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(d, 1, DataType::F32, DataType::F16, DataType::QS8); + + if(c != nullptr) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16, DataType::QS8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c); + ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A"); + ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B"); + ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(0) != d->info()->dimension(0), "The C matrix must have the same number of rows as the output matrix"); + ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != d->info()->dimension(1), "The C matrix must have the same number of columns as the output matrix"); + } + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, d); + ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); + + // Check if the first input tensor is a vector. 
If so, all the kernels for reshaping the tensors can be skipped + if((a->info()->dimension(1) == 1)) + { + _run_vector_matrix_multiplication = true; + + // Configure the matrix multiply kernel + _mm_kernel.configure(a, b, d, alpha); + } + else + { + _run_vector_matrix_multiplication = false; + + TensorShape shape_tmp_a = a->info()->tensor_shape(); + TensorShape shape_tmp_b = b->info()->tensor_shape(); + + shape_tmp_a.set(0, a->info()->dimension(0) * 4); + shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f)); + + switch(a->info()->data_type()) + { + case DataType::F32: + { + shape_tmp_b.set(0, b->info()->dimension(1) * 4); + shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 4.0f)); + break; + } + case DataType::F16: +#ifdef ARM_COMPUTE_ENABLE_FP16 + { + shape_tmp_b.set(0, b->info()->dimension(1) * 8); + shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 8.0f)); + break; + } +#endif + case DataType::QS8: + { + shape_tmp_b.set(0, b->info()->dimension(1) * 16); + shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.0f)); + break; + } + default: + { + ARM_COMPUTE_ERROR_ON("Data type not supported"); + } + } + + TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position()); + TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), a->info()->fixed_point_position()); + + _tmp_a.allocator()->init(info_a); + _tmp_b.allocator()->init(info_b); + + // Configure interleave kernel + _interleave_kernel.configure(a, &_tmp_a); + + // Configure transpose kernel + _transpose_kernel.configure(b, &_tmp_b); + + // Configure matrix multiplication kernel + _mm_kernel.configure(&_tmp_a, &_tmp_b, d, alpha); + + // Allocate once the all configure methods have been called + _tmp_a.allocator()->allocate(); + _tmp_b.allocator()->allocate(); + } + + // Configure matrix addition kernel + if(beta != 0 && c != nullptr) + { + _ma_kernel.configure(c, d, beta); + _run_addition = true; + } +} + +void NEGEMM::run() +{ + if(!_run_vector_matrix_multiplication) + { + // Run interleave kernel + NEScheduler::get().schedule(&_interleave_kernel, Window::DimY); + + // Run transpose kernel + NEScheduler::get().schedule(&_transpose_kernel, Window::DimY); + } + + // Run matrix multiply kernel + NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY); + + // Run matrix addition kernel + if(_run_addition) + { + NEScheduler::get().schedule(&_ma_kernel, Window::DimY); + } +} diff --git a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp new file mode 100644 index 0000000000..4c77c88656 --- /dev/null +++ b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
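// Usage sketch mirroring the configure()/run() contract of NEGEMM above. Shapes and
// the F32 data type are assumptions; the C matrix is omitted, so beta is unused.
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void example_gemm()
{
    Tensor a{}, b{}, d{};
    a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::F32)); // A: 64 columns, 32 rows
    b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::F32)); // B: 16 columns, 64 rows
    d.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32)); // D = alpha * A * B

    NEGEMM gemm{};
    gemm.configure(&a, &b, nullptr, &d, 1.f, 0.f);

    a.allocator()->allocate();
    b.allocator()->allocate();
    d.allocator()->allocate();

    // ... fill a and b ...
    gemm.run();
}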
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" + +using namespace arm_compute; + +void NEGEMMInterleave4x4::configure(const ITensor *input, ITensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); +} diff --git a/src/runtime/NEON/functions/NEGEMMLowp.cpp b/src/runtime/NEON/functions/NEGEMMLowp.cpp new file mode 100644 index 0000000000..b64f769459 --- /dev/null +++ b/src/runtime/NEON/functions/NEGEMMLowp.cpp @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEGEMMLowp.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/TensorAllocator.h" + +using namespace arm_compute; + +NEGEMMLowp::NEGEMMLowp() + : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b() +{ +} + +void NEGEMMLowp::configure(const ITensor *a, const ITensor *b, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output); + ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); + ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A"); + ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != output->info()->dimension(0), "The C matrix must have the same number of columns as the matrix C"); + + /* The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] */ + TensorShape shape_tmp_a = a->info()->tensor_shape(); + shape_tmp_a.set(0, a->info()->dimension(0) * 4); + shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f)); + + TensorShape shape_tmp_b = b->info()->tensor_shape(); + shape_tmp_b.set(0, b->info()->dimension(1) * 16); + shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f)); + + TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type()); + TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type()); + _tmp_a.allocator()->init(info_a); + _tmp_b.allocator()->init(info_b); + + _interleave_kernel.configure(a, &_tmp_a); + _transpose_kernel.configure(b, &_tmp_b); + _mm_kernel.configure(&_tmp_a, &_tmp_b, output, a_offset, b_offset, output_offset, output_mult_int, shift); + + _tmp_a.allocator()->allocate(); + _tmp_b.allocator()->allocate(); +} + +void NEGEMMLowp::run() +{ + /* Run interleave kernel */ + NEScheduler::get().schedule(&_interleave_kernel, Window::DimY); + + /* Run transpose kernel */ + NEScheduler::get().schedule(&_transpose_kernel, Window::DimY); + + /* Run matrix multiply kernel */ + NEScheduler::get().schedule(&_mm_kernel, Window::DimY); +} diff --git a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp new file mode 100644 index 0000000000..dc40ecec14 --- /dev/null +++ b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +using namespace arm_compute; + +void NEGEMMTranspose1xW::configure(const ITensor *input, ITensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); +} diff --git a/src/runtime/NEON/functions/NEGaussian3x3.cpp b/src/runtime/NEON/functions/NEGaussian3x3.cpp new file mode 100644 index 0000000000..95ba5cbdf9 --- /dev/null +++ b/src/runtime/NEON/functions/NEGaussian3x3.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEGaussian3x3.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h" +#include "arm_compute/core/PixelValue.h" + +#include + +using namespace arm_compute; + +void NEGaussian3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, border_mode == BorderMode::UNDEFINED); + _kernel = std::move(k); + _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); +} diff --git a/src/runtime/NEON/functions/NEGaussian5x5.cpp b/src/runtime/NEON/functions/NEGaussian5x5.cpp new file mode 100644 index 0000000000..5ccc765966 --- /dev/null +++ b/src/runtime/NEON/functions/NEGaussian5x5.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/TensorAllocator.h" + +using namespace arm_compute; + +NEGaussian5x5::NEGaussian5x5() + : _kernel_hor(), _kernel_vert(), _tmp(), _border_handler() +{ +} + +void NEGaussian5x5::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + // Init temporary buffer + TensorInfo tensor_info(input->info()->tensor_shape(), Format::S16); + _tmp.allocator()->init(tensor_info); + + // Create and configure kernels for the two passes + _kernel_hor.configure(input, &_tmp, border_mode == BorderMode::UNDEFINED); + _kernel_vert.configure(&_tmp, output, border_mode == BorderMode::UNDEFINED); + + _tmp.allocator()->allocate(); + + _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value)); +} + +void NEGaussian5x5::run() +{ + _border_handler.run(_border_handler.window()); + NEScheduler::get().schedule(&_kernel_hor, Window::DimY); + NEScheduler::get().schedule(&_kernel_vert, Window::DimY); +} diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp new file mode 100644 index 0000000000..e1d64f11f6 --- /dev/null +++ b/src/runtime/NEON/functions/NEGaussianPyramid.cpp @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h" +#include "arm_compute/core/NEON/kernels/NEScaleKernel.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h" +#include "arm_compute/runtime/Pyramid.h" +#include "arm_compute/runtime/Tensor.h" +#include "arm_compute/runtime/TensorAllocator.h" + +#include + +using namespace arm_compute; + +NEGaussianPyramid::NEGaussianPyramid() + : _input(nullptr), _pyramid(nullptr), _tmp() +{ +} + +NEGaussianPyramidHalf::NEGaussianPyramidHalf() + : _border_handler(), _horizontal_reduction(), _vertical_reduction() +{ +} + +void NEGaussianPyramidHalf::configure(const ITensor *input, IPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(nullptr == pyramid); + ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width()); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height()); + ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_HALF != pyramid->info()->scale()); + + /* Get number of pyramid levels */ + const size_t num_levels = pyramid->info()->num_levels(); + + _input = input; + _pyramid = pyramid; + + if(num_levels > 1) + { + _border_handler = arm_compute::cpp14::make_unique(num_levels - 1); + _horizontal_reduction = arm_compute::cpp14::make_unique(num_levels - 1); + _vertical_reduction = arm_compute::cpp14::make_unique(num_levels - 1); + + // Apply half scale to the X dimension of the tensor shape + TensorShape tensor_shape = pyramid->info()->tensor_shape(); + tensor_shape.set(0, (pyramid->info()->width() + 1) * SCALE_PYRAMID_HALF); + + PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_HALF, tensor_shape, Format::S16); + _tmp.init(pyramid_info); + + for(unsigned int i = 0; i < num_levels - 1; ++i) + { + /* Configure horizontal kernel */ + _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode == BorderMode::UNDEFINED); + + /* Configure vertical kernel */ + _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), border_mode == BorderMode::UNDEFINED); + + /* Configure border */ + _border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value)); + } + + _tmp.allocate(); + } +} + +void NEGaussianPyramidHalf::run() +{ + ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function"); + + /* Get number of pyramid levels */ + const size_t num_levels = _pyramid->info()->num_levels(); + + /* The first level of the pyramid has the input image */ + _pyramid->get_pyramid_level(0)->copy_from(*_input); + + for(unsigned int i = 0; i < num_levels - 1; ++i) + { + _border_handler[i].run(_border_handler[i].window()); + NEScheduler::get().schedule(_horizontal_reduction.get() + i, Window::DimY); + NEScheduler::get().schedule(_vertical_reduction.get() + i, Window::DimY); + } +} + +NEGaussianPyramidOrb::NEGaussianPyramidOrb() + : _offsets(), _gaus5x5(), 
_scale_nearest() +{ +} + +void NEGaussianPyramidOrb::configure(const ITensor *input, IPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(nullptr == pyramid); + ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width()); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height()); + ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_ORB != pyramid->info()->scale()); + + /* Get number of pyramid levels */ + const size_t num_levels = pyramid->info()->num_levels(); + + _input = input; + _pyramid = pyramid; + + if(num_levels > 1) + { + _gaus5x5 = arm_compute::cpp14::make_unique(num_levels - 1); + _scale_nearest = arm_compute::cpp14::make_unique(num_levels - 1); + _offsets = arm_compute::cpp14::make_unique(num_levels - 1); + + PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8); + _tmp.init(pyramid_info); + + for(unsigned int i = 0; i < num_levels - 1; ++i) + { + const size_t width = _pyramid->get_pyramid_level(i + 1)->info()->dimension(0); + const size_t height = _pyramid->get_pyramid_level(i + 1)->info()->dimension(1); + + /* Allocate Image for the offsets used by NEAREST interpolation */ + TensorInfo tensor_info(TensorShape(width, height), Format::S32); + _offsets[i].allocator()->init(tensor_info); + + /* Configure gaussian 5x5 */ + _gaus5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value); + + /* Configure scale image kernel */ + _scale_nearest[i].configure(_tmp.get_pyramid_level(i), nullptr, nullptr, _offsets.get() + i, _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, + border_mode == BorderMode::UNDEFINED); + + _offsets[i].allocator()->allocate(); + } + + _tmp.allocate(); + } +} + +void NEGaussianPyramidOrb::run() +{ + ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function"); + + /* Get number of pyramid levels */ + const size_t num_levels = _pyramid->info()->num_levels(); + + /* The first level of the pyramid has the input image */ + _pyramid->get_pyramid_level(0)->copy_from(*_input); + + for(unsigned int i = 0; i < num_levels - 1; ++i) + { + _gaus5x5[i].run(); + NEScheduler::get().schedule(_scale_nearest.get() + i, Window::DimY); + } +} diff --git a/src/runtime/NEON/functions/NEHOGDescriptor.cpp b/src/runtime/NEON/functions/NEHOGDescriptor.cpp new file mode 100644 index 0000000000..a592f53d44 --- /dev/null +++ b/src/runtime/NEON/functions/NEHOGDescriptor.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
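// Usage sketch (image size and level count assumed): building a 4-level half-scale
// pyramid from a U8 image. run() copies the input into level 0 and then produces the
// remaining levels.
#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
#include "arm_compute/runtime/Pyramid.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void example_gaussian_pyramid_half()
{
    Tensor src{};
    src.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));

    Pyramid pyramid{};
    pyramid.init(PyramidInfo(4 /* num_levels */, SCALE_PYRAMID_HALF, TensorShape(640U, 480U), Format::U8));

    NEGaussianPyramidHalf gauss_pyr{};
    gauss_pyr.configure(&src, &pyramid, BorderMode::CONSTANT, 0);

    src.allocator()->allocate();
    pyramid.allocate();
    gauss_pyr.run();
}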
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEHOGDescriptor.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/HOGInfo.h" +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +NEHOGDescriptor::NEHOGDescriptor() + : _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space() +{ +} + +void NEHOGDescriptor::configure(ITensor *input, ITensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(nullptr == output); + ARM_COMPUTE_ERROR_ON(nullptr == hog); + + const HOGInfo *hog_info = hog->info(); + const size_t width = input->info()->dimension(Window::DimX); + const size_t height = input->info()->dimension(Window::DimY); + const size_t num_bins = hog_info->num_bins(); + + Size2D cell_size = hog_info->cell_size(); + + // Calculate number of cells along the x and y directions for the hog_space + const size_t num_cells_x = width / cell_size.width; + const size_t num_cells_y = height / cell_size.height; + + // TensorShape of the input image + const TensorShape &shape_img = input->info()->tensor_shape(); + + // TensorShape of the hog space + TensorShape shape_hog_space = input->info()->tensor_shape(); + shape_hog_space.set(Window::DimX, num_cells_x); + shape_hog_space.set(Window::DimY, num_cells_y); + + // Allocate memory for magnitude, phase and hog space + TensorInfo info_mag(shape_img, Format::S16); + _mag.allocator()->init(info_mag); + + TensorInfo info_phase(shape_img, Format::U8); + _phase.allocator()->init(info_phase); + + TensorInfo info_space(shape_hog_space, num_bins, DataType::F32); + _hog_space.allocator()->init(info_space); + + // Initialise gradient kernel + _gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value); + + // Initialise orientation binning kernel + _orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info()); + + // Initialize HOG norm kernel + _block_norm.configure(&_hog_space, output, hog->info()); + + // Allocate intermediate tensors + _mag.allocator()->allocate(); + _phase.allocator()->allocate(); + _hog_space.allocator()->allocate(); +} + +void NEHOGDescriptor::run() +{ + // Run gradient + _gradient.run(); + + // Run orientation binning kernel + NEScheduler::get().schedule(&_orient_bin, Window::DimY); + + // Run block normalization kernel + NEScheduler::get().schedule(&_block_norm, Window::DimY); +} diff --git a/src/runtime/NEON/functions/NEHOGDetector.cpp b/src/runtime/NEON/functions/NEHOGDetector.cpp new file mode 100644 index 0000000000..e8ed29d0b9 --- /dev/null +++ b/src/runtime/NEON/functions/NEHOGDetector.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
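// Usage sketch (all geometry assumed): a 64x128 U8 image with 8x8 cells, 16x16 blocks,
// an 8x8 block stride and 9 bins gives a 7x15 grid of blocks with 36 values per block
// (9 bins x 4 cells), hence the F32 descriptor tensor below. The HOGInfo parameters
// are illustrative assumptions only.
#include "arm_compute/runtime/HOG.h"
#include "arm_compute/runtime/NEON/functions/NEHOGDescriptor.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void example_hog_descriptor()
{
    HOG hog{};
    hog.init(HOGInfo(Size2D(8, 8), Size2D(16, 16), Size2D(64, 128), Size2D(8, 8), 9));

    Tensor src{}, descriptor{};
    src.allocator()->init(TensorInfo(TensorShape(64U, 128U), Format::U8));
    descriptor.allocator()->init(TensorInfo(TensorShape(7U, 15U), 36, DataType::F32));

    NEHOGDescriptor hog_descriptor{};
    hog_descriptor.configure(&src, &descriptor, &hog, BorderMode::CONSTANT, 0);

    src.allocator()->allocate();
    descriptor.allocator()->allocate();
    hog_descriptor.run();
}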
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEHOGDetector.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h" + +using namespace arm_compute; + +void NEHOGDetector::configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, hog, detection_windows, detection_window_stride, threshold, idx_class); + _kernel = std::move(k); +} \ No newline at end of file diff --git a/src/runtime/NEON/functions/NEHOGGradient.cpp b/src/runtime/NEON/functions/NEHOGGradient.cpp new file mode 100644 index 0000000000..2f4b8802e3 --- /dev/null +++ b/src/runtime/NEON/functions/NEHOGGradient.cpp @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEHOGGradient.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +NEHOGGradient::NEHOGGradient() + : _derivative(), _mag_phase(nullptr), _gx(), _gy() +{ +} + +void NEHOGGradient::configure(ITensor *input, ITensor *output_magnitude, ITensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_magnitude, 1, DataType::S16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_phase, 1, DataType::U8); + + const TensorShape &shape_img = input->info()->tensor_shape(); + + // Allocate image memory + TensorInfo info(shape_img, Format::S16); + _gx.allocator()->init(info); + _gy.allocator()->init(info); + + // Initialise derivate kernel + _derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value); + + // Initialise magnitude/phase kernel + if(PhaseType::UNSIGNED == phase_type) + { + auto k = arm_compute::cpp14::make_unique>(); + k->configure(&_gx, &_gy, output_magnitude, output_phase); + _mag_phase = std::move(k); + } + else + { + auto k = arm_compute::cpp14::make_unique>(); + k->configure(&_gx, &_gy, output_magnitude, output_phase); + _mag_phase = std::move(k); + } + + // Allocate intermediate tensors + _gx.allocator()->allocate(); + _gy.allocator()->allocate(); +} + +void NEHOGGradient::run() +{ + // Run derivative + _derivative.run(); + + // Run magnitude/phase kernel + NEScheduler::get().schedule(_mag_phase.get(), Window::DimY); +} diff --git a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp new file mode 100644 index 0000000000..173b8f4c42 --- /dev/null +++ b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/Tensor.h" + +using namespace arm_compute; + +NEHOGMultiDetection::NEHOGMultiDetection() + : _gradient_kernel(), _orient_bin_kernel(), _block_norm_kernel(), _hog_detect_kernel(), _non_maxima_kernel(), _hog_space(), _hog_norm_space(), _detection_windows(), _mag(), _phase(), + _non_maxima_suppression(false), _num_orient_bin_kernel(0), _num_block_norm_kernel(0), _num_hog_detect_kernel(0) +{ +} + +void NEHOGMultiDetection::configure(ITensor *input, const IMultiHOG *multi_hog, IDetectionWindowArray *detection_windows, const ISize2DArray *detection_window_strides, BorderMode border_mode, + uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(multi_hog); + ARM_COMPUTE_ERROR_ON(nullptr == detection_windows); + ARM_COMPUTE_ERROR_ON(detection_window_strides->num_values() != multi_hog->num_models()); + + const size_t width = input->info()->dimension(Window::DimX); + const size_t height = input->info()->dimension(Window::DimY); + const TensorShape &shape_img = input->info()->tensor_shape(); + const size_t num_models = multi_hog->num_models(); + PhaseType phase_type = multi_hog->model(0)->info()->phase_type(); + + size_t prev_num_bins = multi_hog->model(0)->info()->num_bins(); + Size2D prev_cell_size = multi_hog->model(0)->info()->cell_size(); + Size2D prev_block_size = multi_hog->model(0)->info()->block_size(); + Size2D prev_block_stride = multi_hog->model(0)->info()->block_stride(); + + /* Check if NEHOGOrientationBinningKernel and NEHOGBlockNormalizationKernel kernels can be skipped for a specific HOG data-object + * + * 1) NEHOGOrientationBinningKernel and NEHOGBlockNormalizationKernel are skipped if the cell size and the number of bins don't change. + * Since "multi_hog" is sorted,it is enough to check the HOG descriptors at level "ith" and level "(i-1)th + * 2) NEHOGBlockNormalizationKernel is skipped if the cell size, the number of bins and block size do not change. + * Since "multi_hog" is sorted,it is enough to check the HOG descriptors at level "ith" and level "(i-1)th + * + * @note Since the orientation binning and block normalization kernels can be skipped, we need to keep track of the input to process for each kernel + * with "input_orient_bin", "input_hog_detect" and "input_block_norm" + */ + std::vector input_orient_bin; + std::vector input_hog_detect; + std::vector> input_block_norm; + + input_orient_bin.push_back(0); + input_hog_detect.push_back(0); + input_block_norm.emplace_back(0, 0); + + for(size_t i = 1; i < num_models; ++i) + { + size_t cur_num_bins = multi_hog->model(i)->info()->num_bins(); + Size2D cur_cell_size = multi_hog->model(i)->info()->cell_size(); + Size2D cur_block_size = multi_hog->model(i)->info()->block_size(); + Size2D cur_block_stride = multi_hog->model(i)->info()->block_stride(); + + if((cur_num_bins != prev_num_bins) || (cur_cell_size.width != prev_cell_size.width) || (cur_cell_size.height != prev_cell_size.height)) + { + prev_num_bins = cur_num_bins; + prev_cell_size = cur_cell_size; + prev_block_size = cur_block_size; + prev_block_stride = cur_block_stride; + + // Compute orientation binning and block normalization kernels. 
Update input to process + input_orient_bin.push_back(i); + input_block_norm.emplace_back(i, input_orient_bin.size() - 1); + } + else if((cur_block_size.width != prev_block_size.width) || (cur_block_size.height != prev_block_size.height) || (cur_block_stride.width != prev_block_stride.width) + || (cur_block_stride.height != prev_block_stride.height)) + { + prev_block_size = cur_block_size; + prev_block_stride = cur_block_stride; + + // Compute block normalization kernel. Update input to process + input_block_norm.emplace_back(i, input_orient_bin.size() - 1); + } + + // Update input to process for hog detector kernel + input_hog_detect.push_back(input_block_norm.size() - 1); + } + + _detection_windows = detection_windows; + _non_maxima_suppression = non_maxima_suppression; + _num_orient_bin_kernel = input_orient_bin.size(); // Number of NEHOGOrientationBinningKernel kernels to compute + _num_block_norm_kernel = input_block_norm.size(); // Number of NEHOGBlockNormalizationKernel kernels to compute + _num_hog_detect_kernel = input_hog_detect.size(); // Number of NEHOGDetector functions to compute + + _orient_bin_kernel = arm_compute::cpp14::make_unique(_num_orient_bin_kernel); + _block_norm_kernel = arm_compute::cpp14::make_unique(_num_block_norm_kernel); + _hog_detect_kernel = arm_compute::cpp14::make_unique(_num_hog_detect_kernel); + _non_maxima_kernel = arm_compute::cpp14::make_unique(); + _hog_space = arm_compute::cpp14::make_unique(_num_orient_bin_kernel); + _hog_norm_space = arm_compute::cpp14::make_unique(_num_block_norm_kernel); + + // Allocate tensors for magnitude and phase + TensorInfo info_mag(shape_img, Format::S16); + _mag.allocator()->init(info_mag); + + TensorInfo info_phase(shape_img, Format::U8); + _phase.allocator()->init(info_phase); + + // Initialise gradient kernel + _gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value); + + // Configure NETensor for the HOG space and orientation binning kernel + for(size_t i = 0; i < _num_orient_bin_kernel; ++i) + { + const size_t idx_multi_hog = input_orient_bin[i]; + + // Get the corresponding cell size and number of bins + const Size2D &cell = multi_hog->model(idx_multi_hog)->info()->cell_size(); + const size_t num_bins = multi_hog->model(idx_multi_hog)->info()->num_bins(); + + // Calculate number of cells along the x and y directions for the hog_space + const size_t num_cells_x = width / cell.width; + const size_t num_cells_y = height / cell.height; + + // TensorShape of hog space + TensorShape shape_hog_space = input->info()->tensor_shape(); + shape_hog_space.set(Window::DimX, num_cells_x); + shape_hog_space.set(Window::DimY, num_cells_y); + + // Allocate HOG space + TensorInfo info_space(shape_hog_space, num_bins, DataType::F32); + _hog_space[i].allocator()->init(info_space); + + // Initialise orientation binning kernel + _orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info()); + } + + // Configure NETensor for the normalized HOG space and block normalization kernel + for(size_t i = 0; i < _num_block_norm_kernel; ++i) + { + const size_t idx_multi_hog = input_block_norm[i].first; + const size_t idx_orient_bin = input_block_norm[i].second; + + // Allocate normalized HOG space + TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height); + _hog_norm_space[i].allocator()->init(tensor_info); + + // Initialize block normalization kernel + _block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, 
_hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info()); + } + + // Configure HOG detector kernel + for(size_t i = 0; i < _num_hog_detect_kernel; ++i) + { + const size_t idx_block_norm = input_hog_detect[i]; + + _hog_detect_kernel[i].configure(_hog_norm_space.get() + idx_block_norm, multi_hog->model(i), detection_windows, detection_window_strides->at(i), threshold, i); + } + + // Configure non maxima suppression kernel + _non_maxima_kernel->configure(_detection_windows, min_distance); + + // Allocate intermediate tensors + _mag.allocator()->allocate(); + _phase.allocator()->allocate(); + + for(size_t i = 0; i < _num_orient_bin_kernel; ++i) + { + _hog_space[i].allocator()->allocate(); + } + + for(size_t i = 0; i < _num_block_norm_kernel; ++i) + { + _hog_norm_space[i].allocator()->allocate(); + } +} + +void NEHOGMultiDetection::run() +{ + ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function"); + + // Reset detection window + _detection_windows->clear(); + + // Run gradient + _gradient_kernel.run(); + + // Run orientation binning kernel + for(size_t i = 0; i < _num_orient_bin_kernel; ++i) + { + NEScheduler::get().schedule(_orient_bin_kernel.get() + i, Window::DimY); + } + + // Run block normalization kernel + for(size_t i = 0; i < _num_block_norm_kernel; ++i) + { + NEScheduler::get().schedule(_block_norm_kernel.get() + i, Window::DimY); + } + + // Run HOG detector kernel + for(size_t i = 0; i < _num_hog_detect_kernel; ++i) + { + _hog_detect_kernel[i].run(); + } + + // Run non-maxima suppression kernel if enabled + if(_non_maxima_suppression) + { + _non_maxima_kernel->run(_non_maxima_kernel->window()); + } +} diff --git a/src/runtime/NEON/functions/NEHarrisCorners.cpp b/src/runtime/NEON/functions/NEHarrisCorners.cpp new file mode 100644 index 0000000000..b54fb67ab7 --- /dev/null +++ b/src/runtime/NEON/functions/NEHarrisCorners.cpp @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEHarrisCorners.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/Array.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/NEON/functions/NESobel3x3.h" +#include "arm_compute/runtime/NEON/functions/NESobel5x5.h" +#include "arm_compute/runtime/NEON/functions/NESobel7x7.h" +#include "arm_compute/runtime/TensorAllocator.h" + +#include +#include + +using namespace arm_compute; + +NEHarrisCorners::NEHarrisCorners() + : _sobel(), _harris_score(), _non_max_suppr(), _candidates(), _sort_euclidean(), _border_gx(), _border_gy(), _gx(), _gy(), _score(), _nonmax(), _corners_list(), _num_corner_candidates(0) +{ +} + +void NEHarrisCorners::configure(IImage *input, float threshold, float min_dist, + float sensitivity, int32_t gradient_size, int32_t block_size, KeyPointArray *corners, + BorderMode border_mode, uint8_t constant_border_value, bool use_fp16) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7)); + + const TensorShape shape = input->info()->tensor_shape(); + TensorInfo tensor_info_gxgy; + + if(gradient_size < 7) + { + tensor_info_gxgy.init(shape, Format::S16); + } + else + { + tensor_info_gxgy.init(shape, Format::S32); + } + + _gx.allocator()->init(tensor_info_gxgy); + _gy.allocator()->init(tensor_info_gxgy); + + TensorInfo tensor_info_score(shape, Format::F32); + _score.allocator()->init(tensor_info_score); + _nonmax.allocator()->init(tensor_info_score); + + _corners_list = arm_compute::cpp14::make_unique(shape.x() * shape.y()); + + // Set/init Sobel kernel accordingly with gradient_size + switch(gradient_size) + { + case 3: + { + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, &_gx, &_gy, border_mode, constant_border_value); + _sobel = std::move(k); + break; + } + case 5: + { + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, &_gx, &_gy, border_mode, constant_border_value); + _sobel = std::move(k); + break; + } + case 7: + { + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, &_gx, &_gy, border_mode, constant_border_value); + _sobel = std::move(k); + break; + } + default: + ARM_COMPUTE_ERROR("Gradient size not implemented"); + } + + // Normalization factor + const float norm_factor = 1.0f / (255.0f * pow(4.0f, gradient_size / 2) * block_size); + + if(use_fp16) + { + switch(block_size) + { + case 3: + { + auto k = arm_compute::cpp14::make_unique>(); + k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED); + _harris_score = std::move(k); + } + break; + case 5: + { + auto k = arm_compute::cpp14::make_unique>(); + k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED); + _harris_score = std::move(k); + } + break; + case 7: + { + auto k = arm_compute::cpp14::make_unique>(); + k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED); + _harris_score = std::move(k); + } + default: + break; + } + } + else + { + // Set/init Harris Score kernel accordingly with block_size + switch(block_size) + { 
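+            // Each supported block size (3, 5 or 7) selects its own Harris score kernel variant; all of them are configured with the same arguments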
+ case 3: + { + auto k = arm_compute::cpp14::make_unique>(); + k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED); + _harris_score = std::move(k); + } + break; + case 5: + { + auto k = arm_compute::cpp14::make_unique>(); + k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED); + _harris_score = std::move(k); + } + break; + case 7: + { + auto k = arm_compute::cpp14::make_unique>(); + k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED); + _harris_score = std::move(k); + } + default: + break; + } + } + + // Configure border filling before harris score + _border_gx.configure(&_gx, _harris_score->border_size(), border_mode, constant_border_value); + _border_gy.configure(&_gy, _harris_score->border_size(), border_mode, constant_border_value); + + // Init non-maxima suppression function + _non_max_suppr.configure(&_score, &_nonmax, border_mode); + + // Init corner candidates kernel + _candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates); + + // Init euclidean distance + _sort_euclidean.configure(_corners_list.get(), corners, &_num_corner_candidates, min_dist); + + // Allocate once all the configure methods have been called + _gx.allocator()->allocate(); + _gy.allocator()->allocate(); + _score.allocator()->allocate(); + _nonmax.allocator()->allocate(); +} + +void NEHarrisCorners::run() +{ + ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function"); + + // Init to 0 number of corner candidates + _num_corner_candidates = 0; + + // Run Sobel kernel + _sobel->run(); + + // Fill border before harris score kernel + _border_gx.run(_border_gx.window()); + _border_gy.run(_border_gy.window()); + + // Run harris score kernel + NEScheduler::get().schedule(_harris_score.get(), Window::DimY); + + // Run non-maxima suppression + _non_max_suppr.run(); + + // Run corner candidate kernel + NEScheduler::get().schedule(&_candidates, Window::DimY); + + // Run sort & euclidean distance + _sort_euclidean.run(_sort_euclidean.window()); +} diff --git a/src/runtime/NEON/functions/NEHistogram.cpp b/src/runtime/NEON/functions/NEHistogram.cpp new file mode 100644 index 0000000000..c42b2a56e0 --- /dev/null +++ b/src/runtime/NEON/functions/NEHistogram.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEHistogram.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IDistribution1D.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +NEHistogram::NEHistogram() + : _histogram_kernel(), _local_hist(), _window_lut(arm_compute::cpp14::make_unique(window_lut_default_size)), _local_hist_size(0) +{ +} + +void NEHistogram::configure(const IImage *input, IDistribution1D *output) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + + // Allocate space for threads local histograms + _local_hist_size = output->num_bins() * NEScheduler::get().num_threads(); + _local_hist = arm_compute::cpp14::make_unique(_local_hist_size); + + // Configure kernel + _histogram_kernel.configure(input, output, _local_hist.get(), _window_lut.get()); +} + +void NEHistogram::run() +{ + // Calculate histogram of input. + NEScheduler::get().schedule(&_histogram_kernel, Window::DimY); +} diff --git a/src/runtime/NEON/functions/NEIntegralImage.cpp b/src/runtime/NEON/functions/NEIntegralImage.cpp new file mode 100644 index 0000000000..af604e9295 --- /dev/null +++ b/src/runtime/NEON/functions/NEIntegralImage.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEIntegralImage.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEIntegralImageKernel.h" +#include "arm_compute/core/Types.h" + +#include + +using namespace arm_compute; + +void NEIntegralImage::configure(const ITensor *input, ITensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); + _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, 0); +} diff --git a/src/runtime/NEON/functions/NELaplacianPyramid.cpp b/src/runtime/NEON/functions/NELaplacianPyramid.cpp new file mode 100644 index 0000000000..8232c79f2d --- /dev/null +++ b/src/runtime/NEON/functions/NELaplacianPyramid.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NELaplacianPyramid.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IPyramid.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" +#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h" +#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h" +#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h" +#include "arm_compute/runtime/Tensor.h" + +using namespace arm_compute; + +NELaplacianPyramid::NELaplacianPyramid() + : _num_levels(0), _gaussian_pyr_function(), _convf(), _subf(), _gauss_pyr(), _conv_pyr(), _depth_function() +{ +} + +void NELaplacianPyramid::run() +{ + ARM_COMPUTE_ERROR_ON_MSG(0 == _num_levels, "Unconfigured function"); + + // Compute Gaussian Pyramid + _gaussian_pyr_function.run(); + + for(unsigned int i = 0; i < _num_levels; ++i) + { + // Apply Gaussian filter to gaussian pyramid image + _convf[i].run(); + } + + for(unsigned int i = 0; i < _num_levels; ++i) + { + // Compute laplacian image + _subf[i].run(); + } + + _depth_function.run(); +} + +void NELaplacianPyramid::configure(const ITensor *input, IPyramid *pyramid, ITensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON(nullptr == pyramid); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16); + ARM_COMPUTE_ERROR_ON(0 == pyramid->info()->num_levels()); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width()); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height()); + ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0)); + ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1)); + + _num_levels = pyramid->info()->num_levels(); + + // Create and initialize the gaussian pyramid and the convoluted pyramid + PyramidInfo pyramid_info; + pyramid_info.init(_num_levels, 0.5f, pyramid->info()->tensor_shape(), arm_compute::Format::U8); + + _gauss_pyr.init(pyramid_info); + _conv_pyr.init(pyramid_info); + + // Create Gaussian Pyramid function + 
_gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value);
+
+    _convf = arm_compute::cpp14::make_unique<NEGaussian5x5[]>(_num_levels);
+    _subf  = arm_compute::cpp14::make_unique<NEArithmeticSubtraction[]>(_num_levels);
+
+    for(unsigned int i = 0; i < _num_levels; ++i)
+    {
+        _convf[i].configure(_gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), border_mode, constant_border_value);
+        _subf[i].configure(_gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), pyramid->get_pyramid_level(i), ConvertPolicy::WRAP);
+    }
+
+    _depth_function.configure(_conv_pyr.get_pyramid_level(_num_levels - 1), output, ConvertPolicy::WRAP, 0);
+
+    _gauss_pyr.allocate();
+    _conv_pyr.allocate();
+}
diff --git a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
new file mode 100644
index 0000000000..36ac4a74d1
--- /dev/null
+++ b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#include "arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IPyramid.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include + +using namespace arm_compute; + +NELaplacianReconstruct::NELaplacianReconstruct() + : _tmp_pyr(), _addf(), _scalef(), _depthf() +{ +} + +void NELaplacianReconstruct::configure(const IPyramid *pyramid, const ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON(nullptr == pyramid); + ARM_COMPUTE_ERROR_ON(input == output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); + ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions()); + ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(0)->info()->dimension(0)); + ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(0)->info()->dimension(1)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1)); + + const size_t num_levels = pyramid->info()->num_levels(); + + // Create and initialize the tmp pyramid: I(n-2) = upsample( input + Laplace(n-1) ) + PyramidInfo pyramid_info; + pyramid_info.init(num_levels, 0.5f, output->info()->tensor_shape(), arm_compute::Format::S16); + + _tmp_pyr.init(pyramid_info); + + // Allocate add and scale functions. Level 0 does not need to be scaled. + _addf = arm_compute::cpp14::make_unique(num_levels); + _scalef = arm_compute::cpp14::make_unique(num_levels - 1); + + const size_t last_level = num_levels - 1; + + _addf[last_level].configure(input, pyramid->get_pyramid_level(last_level), _tmp_pyr.get_pyramid_level(last_level), ConvertPolicy::SATURATE); + + // Scale levels n-1 to 1, and add levels n-2 to 0 + for(size_t l = 0; l < last_level; ++l) + { + _scalef[l].configure(_tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value); + _addf[l].configure(_tmp_pyr.get_pyramid_level(l), pyramid->get_pyramid_level(l), _tmp_pyr.get_pyramid_level(l), ConvertPolicy::SATURATE); + } + + // Convert level 0 from S16 to U8 + _depthf.configure(_tmp_pyr.get_pyramid_level(0), output, ConvertPolicy::SATURATE, 0); + + _tmp_pyr.allocate(); +} + +void NELaplacianReconstruct::run() +{ + ARM_COMPUTE_ERROR_ON_MSG(_addf == nullptr, "Unconfigured function"); + + const size_t last_level = _tmp_pyr.info()->num_levels() - 1; + + _addf[last_level].run(); + + // Run l = [last_level - 1, 0] + for(size_t l = last_level; l-- > 0;) + { + _scalef[l].run(); + _addf[l].run(); + } + + _depthf.run(); +} diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp new file mode 100644 index 0000000000..85d7ba3650 --- /dev/null +++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h" + +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include +#include + +using namespace arm_compute; + +NELocallyConnectedLayer::NELocallyConnectedLayer() + : _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), _is_first_run(false) +{ +} + +void NELocallyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output); + ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2)); + + if(biases != nullptr) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); + ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3)); + ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 2); + } + + bool _has_bias = (biases != nullptr); + _is_first_run = true; + + // Get parameters for conv_info + unsigned int stride_x = 0; + unsigned int stride_y = 0; + unsigned int pad_x = 0; + unsigned int pad_y = 0; + std::tie(stride_x, stride_y) = conv_info.stride(); + std::tie(pad_x, pad_y) = conv_info.pad(); + + // Get convolved dimensions + unsigned int conv_w = 0; + unsigned int conv_h = 0; + std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), + stride_x, stride_y, pad_x, pad_y, conv_info.round()); + + ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one"); + ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one"); + + // Create tensor to store the reshaped weights + const size_t mat_weights_cols = weights->info()->dimension(3); + const size_t 
mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0); + const size_t mat_weights_num = weights->info()->dimension(4); + + const TensorShape shape_wr(mat_weights_cols, mat_weights_rows, mat_weights_num); + + _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type())); + + // Create tensor to store im2col reshaped inputs + const size_t mat_input_cols = mat_weights_rows; + const size_t mat_input_rows = conv_w * conv_h; + TensorShape shape_im2col = input->info()->tensor_shape(); + shape_im2col.set(0, mat_input_cols); + shape_im2col.set(1, mat_input_rows); + shape_im2col.set(2, 1); + + _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type())); + + // Create locally connected layer output tensor + TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape(); + shape_gemm.set(0, mat_weights_cols); + shape_gemm.set(1, mat_input_rows); + _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type())); + + // Configure kernels + _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias); + _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped); + _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output); + _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h)); + + // Allocate intermediate tensors + _weights_reshaped.allocator()->allocate(); + _input_im2col_reshaped.allocator()->allocate(); + _gemm_output.allocator()->allocate(); +} + +void NELocallyConnectedLayer::run() +{ + // Run weights reshaping (Runs once for every configure) + if(_is_first_run) + { + _is_first_run = false; + NEScheduler::get().schedule(&_weights_reshape_kernel, 3); + } + + // Run input reshaping + NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY); + + // Runs GEMM on reshaped matrices + NEScheduler::get().schedule(&_mm_kernel, Window::DimX); + + // Reshape output matrix + NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY); +} diff --git a/src/runtime/NEON/functions/NEMagnitude.cpp b/src/runtime/NEON/functions/NEMagnitude.cpp new file mode 100644 index 0000000000..9390ca2b6a --- /dev/null +++ b/src/runtime/NEON/functions/NEMagnitude.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEMagnitude.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h" +#include "arm_compute/core/Types.h" + +#include + +using namespace arm_compute; + +void NEMagnitude::configure(const ITensor *input1, const ITensor *input2, ITensor *output, bool use_fp16) +{ + if(use_fp16) + { + auto k = arm_compute::cpp14::make_unique>(); + k->configure(input1, input2, output, nullptr); + _kernel = std::move(k); + } + else + { + auto k = arm_compute::cpp14::make_unique>(); + k->configure(input1, input2, output, nullptr); + _kernel = std::move(k); + } +} diff --git a/src/runtime/NEON/functions/NEMeanStdDev.cpp b/src/runtime/NEON/functions/NEMeanStdDev.cpp new file mode 100644 index 0000000000..47143f5e5b --- /dev/null +++ b/src/runtime/NEON/functions/NEMeanStdDev.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEMeanStdDev.h" + +#include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +NEMeanStdDev::NEMeanStdDev() + : _mean_stddev_kernel(), _global_sum(0), _global_sum_squared(0) +{ +} + +void NEMeanStdDev::configure(const IImage *input, float *mean, float *stddev) +{ + _mean_stddev_kernel.configure(input, mean, &_global_sum, stddev, &_global_sum_squared); +} + +void NEMeanStdDev::run() +{ + _global_sum = 0; + _global_sum_squared = 0; + + NEScheduler::get().schedule(&_mean_stddev_kernel, Window::DimY); +} diff --git a/src/runtime/NEON/functions/NEMedian3x3.cpp b/src/runtime/NEON/functions/NEMedian3x3.cpp new file mode 100644 index 0000000000..aa7cc97081 --- /dev/null +++ b/src/runtime/NEON/functions/NEMedian3x3.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEMedian3x3.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEMedian3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+    auto k = arm_compute::cpp14::make_unique<NEMedian3x3Kernel>();
+    k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+    _kernel = std::move(k);
+    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/NEON/functions/NEMinMaxLocation.cpp b/src/runtime/NEON/functions/NEMinMaxLocation.cpp
new file mode 100644
index 0000000000..cab9200cf8
--- /dev/null
+++ b/src/runtime/NEON/functions/NEMinMaxLocation.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#include "arm_compute/runtime/NEON/functions/NEMinMaxLocation.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +NEMinMaxLocation::NEMinMaxLocation() + : _min_max(), _min_max_loc() +{ +} + +void NEMinMaxLocation::configure(const IImage *input, int32_t *min, int32_t *max, ICoordinates2DArray *min_loc, ICoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count) +{ + _min_max.configure(input, min, max); + _min_max_loc.configure(input, min, max, min_loc, max_loc, min_count, max_count); +} + +void NEMinMaxLocation::run() +{ + _min_max.reset(); + + /* Run min max kernel */ + NEScheduler::get().schedule(&_min_max, Window::DimY); + + /* Run min max location */ + NEScheduler::get().schedule(&_min_max_loc, Window::DimY); +} diff --git a/src/runtime/NEON/functions/NENonLinearFilter.cpp b/src/runtime/NEON/functions/NENonLinearFilter.cpp new file mode 100644 index 0000000000..01aea3b671 --- /dev/null +++ b/src/runtime/NEON/functions/NENonLinearFilter.cpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NENonLinearFilter.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h" +#include "arm_compute/core/PixelValue.h" + +#include + +using namespace arm_compute; + +void NENonLinearFilter::configure(ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, + BorderMode border_mode, + uint8_t constant_border_value) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED); + _kernel = std::move(k); + _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); +} diff --git a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp new file mode 100644 index 0000000000..a7b3759a45 --- /dev/null +++ b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NENonMaximaSuppression3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode)
+{
+    auto k = arm_compute::cpp14::make_unique<NENonMaximaSuppression3x3Kernel>();
+    k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+    _kernel = std::move(k);
+
+    if(border_mode != BorderMode::UNDEFINED)
+    {
+        _border_handler.configure(input, 1, BorderMode::CONSTANT, 0);
+    }
+    else
+    {
+        _border_handler.configure(input, 1, BorderMode::UNDEFINED, 0);
+    }
+}
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
new file mode 100644
index 0000000000..69ff32591f
--- /dev/null
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ + +#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +NENormalizationLayer::NENormalizationLayer() + : _norm_kernel(), _multiply_kernel(), _border_handler(), _input_squared() +{ +} + +void NENormalizationLayer::configure(const ITensor *input, ITensor *output, NormalizationLayerInfo norm_info) +{ + ARM_COMPUTE_ERROR_ON(input == nullptr); + + TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); + _input_squared.allocator()->init(tensor_info); + + // Configure kernels + _norm_kernel.configure(input, &_input_squared, output, norm_info); + _multiply_kernel.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _border_handler.configure(&_input_squared, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0.0f)); + + // Allocate the tensor once the configure methods have been called + _input_squared.allocator()->allocate(); +} + +void NENormalizationLayer::run() +{ + NEScheduler::get().schedule(&_multiply_kernel, Window::DimY); + NEScheduler::get().schedule(&_border_handler, Window::DimY); + NEScheduler::get().schedule(&_norm_kernel, Window::DimY); +} diff --git a/src/runtime/NEON/functions/NEOpticalFlow.cpp b/src/runtime/NEON/functions/NEOpticalFlow.cpp new file mode 100644 index 0000000000..49135e442c --- /dev/null +++ b/src/runtime/NEON/functions/NEOpticalFlow.cpp @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEOpticalFlow.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/NEON/functions/NEScharr3x3.h" +#include "arm_compute/runtime/Pyramid.h" +#include "arm_compute/runtime/Tensor.h" +#include "arm_compute/runtime/TensorAllocator.h" + +using namespace arm_compute; + +NEOpticalFlow::NEOpticalFlow() + : _func_scharr(), _kernel_tracker(), _scharr_gx(), _scharr_gy(), _new_points(nullptr), _new_points_estimates(nullptr), _old_points(nullptr), _new_points_internal(), _old_points_internal(), + _num_levels(0) +{ +} + +void NEOpticalFlow::configure(const Pyramid *old_pyramid, const Pyramid *new_pyramid, const IKeyPointArray *old_points, const IKeyPointArray *new_points_estimates, + IKeyPointArray *new_points, Termination termination, float epsilon, unsigned int num_iterations, size_t window_dimension, + bool use_initial_estimate, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON(nullptr == old_pyramid); + ARM_COMPUTE_ERROR_ON(nullptr == new_pyramid); + ARM_COMPUTE_ERROR_ON(nullptr == old_points); + ARM_COMPUTE_ERROR_ON(nullptr == new_points_estimates); + ARM_COMPUTE_ERROR_ON(nullptr == new_points); + ARM_COMPUTE_ERROR_ON(old_pyramid->info()->num_levels() != new_pyramid->info()->num_levels()); + ARM_COMPUTE_ERROR_ON(0 == old_pyramid->info()->num_levels()); + ARM_COMPUTE_ERROR_ON(old_pyramid->info()->width() != new_pyramid->info()->width()); + ARM_COMPUTE_ERROR_ON(old_pyramid->info()->height() != new_pyramid->info()->height()); + ARM_COMPUTE_ERROR_ON(use_initial_estimate && old_points->num_values() != new_points_estimates->num_values()); + + _num_levels = old_pyramid->info()->num_levels(); + _old_points = old_points; + _new_points = new_points; + _new_points_estimates = new_points_estimates; + + const float pyr_scale = old_pyramid->info()->scale(); + + _func_scharr = arm_compute::cpp14::make_unique(_num_levels); + _kernel_tracker = arm_compute::cpp14::make_unique(_num_levels); + _scharr_gx = arm_compute::cpp14::make_unique(_num_levels); + _scharr_gy = arm_compute::cpp14::make_unique(_num_levels); + + _old_points_internal = LKInternalKeypointArray(old_points->num_values()); + _new_points_internal = LKInternalKeypointArray(old_points->num_values()); + _new_points->resize(old_points->num_values()); + + for(unsigned int i = 0; i < _num_levels; ++i) + { + // Get images from the ith level of old and right pyramid + IImage *old_ith_input = old_pyramid->get_pyramid_level(i); + IImage *new_ith_input = new_pyramid->get_pyramid_level(i); + + // Get width and height of images + const unsigned int width_ith = old_ith_input->info()->dimension(0); + const unsigned int height_ith = new_ith_input->info()->dimension(1); + + TensorInfo tensor_info(TensorShape(width_ith, height_ith), Format::S16); + + _scharr_gx[i].allocator()->init(tensor_info); + _scharr_gy[i].allocator()->init(tensor_info); + + // Init Scharr kernel + _func_scharr[i].configure(old_ith_input, _scharr_gx.get() + i, _scharr_gy.get() + i, border_mode, constant_border_value); + + // Init Lucas-Kanade kernel + _kernel_tracker[i].configure(old_ith_input, new_ith_input, _scharr_gx.get() + i, _scharr_gy.get() + i, + old_points, new_points_estimates, new_points, + &_old_points_internal, 
&_new_points_internal, + termination, use_initial_estimate, epsilon, num_iterations, window_dimension, + i, _num_levels, pyr_scale); + + _scharr_gx[i].allocator()->allocate(); + _scharr_gy[i].allocator()->allocate(); + } +} + +void NEOpticalFlow::run() +{ + ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function"); + + for(unsigned int level = _num_levels; level > 0; --level) + { + // Run Scharr kernel + _func_scharr[level - 1].run(); + + // Run Lucas-Kanade kernel + NEScheduler::get().schedule(_kernel_tracker.get() + level - 1, Window::DimX); + } +} diff --git a/src/runtime/NEON/functions/NEPhase.cpp b/src/runtime/NEON/functions/NEPhase.cpp new file mode 100644 index 0000000000..7683f461d3 --- /dev/null +++ b/src/runtime/NEON/functions/NEPhase.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEPhase.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h" + +#include + +using namespace arm_compute; + +void NEPhase::configure(const ITensor *input1, const ITensor *input2, ITensor *output) +{ + auto k = arm_compute::cpp14::make_unique>(); + k->configure(input1, input2, nullptr, output); + _kernel = std::move(k); +} diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp new file mode 100644 index 0000000000..056d33b370 --- /dev/null +++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h" + +#include + +using namespace arm_compute; + +void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input1, input2, output, scale, overflow_policy, rounding_policy); + _kernel = std::move(k); +} diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp new file mode 100644 index 0000000000..6f0cc4f160 --- /dev/null +++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h" + +using namespace arm_compute; + +void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info) +{ + // Configure pooling kernel + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, pool_info); + _kernel = std::move(k); + + // Configure border depending on operation required + BorderMode border_mode = (pool_info.pool_type() == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT; + _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(0)); +} diff --git a/src/runtime/NEON/functions/NERemap.cpp b/src/runtime/NEON/functions/NERemap.cpp new file mode 100644 index 0000000000..9f06fb699c --- /dev/null +++ b/src/runtime/NEON/functions/NERemap.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
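// Usage sketch (illustrative only): 2x2 max pooling with stride 2 through the
// NEPoolingLayer shown above. The shapes and the PoolingLayerInfo/PadStrideInfo arguments
// are assumptions for the example; as the code above shows, MAX pooling fills the border
// with replicated values while other pooling types use a constant border.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void example_pooling_layer()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 4U), 1, DataType::F32, 0));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 16U, 4U), 1, DataType::F32, 0));

    NEPoolingLayer pool;
    pool.configure(&src, &dst, PoolingLayerInfo(PoolingType::MAX, 2, PadStrideInfo(2, 2, 0, 0)));

    src.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill src ...
    pool.run();
}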
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NERemap.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/kernels/NERemapKernel.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/TensorAllocator.h" + +#include + +using namespace arm_compute; + +void NERemap::configure(ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported"); + + auto k = arm_compute::cpp14::make_unique(); + + k->configure(input, map_x, map_y, output, policy); + + _kernel = std::move(k); + _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); +} diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp new file mode 100644 index 0000000000..b70f626df0 --- /dev/null +++ b/src/runtime/NEON/functions/NEScale.cpp @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEScale.h" + +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/kernels/NEScaleKernel.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/runtime/TensorAllocator.h" + +#include +#include +#include + +using namespace arm_compute; + +namespace +{ +void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_element_size) +{ + ARM_COMPUTE_ERROR_ON(nullptr == offsets); + + Window win; + win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1)); + win.set(Window::DimY, Window::Dimension(0, offsets->info()->dimension(1), 1)); + + if(dx != nullptr && dy != nullptr) + { + // Pre-compute the offset and pixel's distance for BILINEAR interpolation + Iterator offsets_it(offsets, win); + Iterator dx_it(dx, win); + Iterator dy_it(dy, win); + + execute_window_loop(win, [&](const Coordinates & id) + { + const float in_x = (id.x() + 0.5f) * wr - 0.5f; + const float in_y = (id.y() + 0.5f) * hr - 0.5f; + const int in_xi = std::floor(in_x); + const int in_yi = std::floor(in_y); + + *reinterpret_cast(offsets_it.ptr()) = in_xi * input_element_size; + *reinterpret_cast(dx_it.ptr()) = in_x - in_xi; + *reinterpret_cast(dy_it.ptr()) = in_y - in_yi; + }, + offsets_it, dx_it, dy_it); + } + else + { + // Pre-compute the offset for NEAREST interpolation + Iterator offsets_it(offsets, win); + + execute_window_loop(win, [&](const Coordinates & id) + { + const size_t in_xi = (id.x() + 0.5f) * wr; + + *reinterpret_cast(offsets_it.ptr()) = in_xi * input_element_size; + }, + offsets_it); + } +} +} // namespace + +NEScale::NEScale() + : _offsets(), _dx(), _dy() +{ +} + +void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + + for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i)); + } + + // Get the tensor shape + const TensorShape shape(output->info()->dimension(0), output->info()->dimension(1)); + + // Compute the ratio between source width/height and destination width/height + const auto wr = static_cast(input->info()->dimension(0)) / static_cast(output->info()->dimension(0)); + const auto hr = static_cast(input->info()->dimension(1)) / static_cast(output->info()->dimension(1)); + + // Get the element size of the input image + const size_t input_element_size = input->info()->element_size(); + + // Area interpolation behaves as Nearest Neighbour in case of up-sampling + if(policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + { + policy = InterpolationPolicy::NEAREST_NEIGHBOR; + } + + auto k = arm_compute::cpp14::make_unique(); + + // Check if the border mode is UNDEFINED + const bool border_undefined = border_mode == BorderMode::UNDEFINED; + + switch(policy) + { + case InterpolationPolicy::NEAREST_NEIGHBOR: + { + TensorInfo 
tensor_info_offsets(shape, Format::S32); + _offsets.allocator()->init(tensor_info_offsets); + + k->configure(input, nullptr, nullptr, &_offsets, output, policy, border_undefined); + + // Allocate once the configure methods have been called + _offsets.allocator()->allocate(); + + // Pre-compute offsets for nearest interpolation + precompute_dx_dy_offsets(nullptr, nullptr, &_offsets, wr, hr, input_element_size); + break; + } + case InterpolationPolicy::BILINEAR: + { + TensorInfo tensor_info_offsets(shape, Format::S32); + TensorInfo tensor_info_dxdy(shape, Format::F32); + + _offsets.allocator()->init(tensor_info_offsets); + _dx.allocator()->init(tensor_info_dxdy); + _dy.allocator()->init(tensor_info_dxdy); + + k->configure(input, &_dx, &_dy, &_offsets, output, policy, border_undefined); + + // Allocate once the configure methods have been called + _offsets.allocator()->allocate(); + _dx.allocator()->allocate(); + _dy.allocator()->allocate(); + + // Pre-compute dx, dy and offsets for bilinear interpolation + precompute_dx_dy_offsets(&_dx, &_dy, &_offsets, wr, hr, input_element_size); + break; + } + case InterpolationPolicy::AREA: + { + k->configure(input, nullptr, nullptr, nullptr, output, policy, border_undefined); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported interpolation mode"); + } + + _kernel = std::move(k); + _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); +} diff --git a/src/runtime/NEON/functions/NEScharr3x3.cpp b/src/runtime/NEON/functions/NEScharr3x3.cpp new file mode 100644 index 0000000000..04b3f14ce7 --- /dev/null +++ b/src/runtime/NEON/functions/NEScharr3x3.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEScharr3x3.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h" +#include "arm_compute/core/PixelValue.h" + +#include + +using namespace arm_compute; + +void NEScharr3x3::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED); + _kernel = std::move(k); + _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); +} diff --git a/src/runtime/NEON/functions/NESobel3x3.cpp b/src/runtime/NEON/functions/NESobel3x3.cpp new file mode 100644 index 0000000000..3b46fd78c1 --- /dev/null +++ b/src/runtime/NEON/functions/NESobel3x3.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NESobel3x3.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NESobel3x3Kernel.h" +#include "arm_compute/core/PixelValue.h" + +#include + +using namespace arm_compute; + +void NESobel3x3::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED); + _kernel = std::move(k); + _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); +} diff --git a/src/runtime/NEON/functions/NESobel5x5.cpp b/src/runtime/NEON/functions/NESobel5x5.cpp new file mode 100644 index 0000000000..8967a22ba1 --- /dev/null +++ b/src/runtime/NEON/functions/NESobel5x5.cpp @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NESobel5x5.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/TensorAllocator.h" + +using namespace arm_compute; + +NESobel5x5::NESobel5x5() + : _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler() +{ +} + +void NESobel5x5::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + + const bool run_sobel_x = output_x != nullptr; + const bool run_sobel_y = output_y != nullptr; + + TensorInfo tensor_info(input->info()->tensor_shape(), Format::S16); + + if(run_sobel_x && run_sobel_y) + { + _tmp_x.allocator()->init(tensor_info); + _tmp_y.allocator()->init(tensor_info); + _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); + _tmp_x.allocator()->allocate(); + _tmp_y.allocator()->allocate(); + } + else if(run_sobel_x) + { + _tmp_x.allocator()->init(tensor_info); + _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); + _tmp_x.allocator()->allocate(); + } + else if(run_sobel_y) + { + _tmp_y.allocator()->init(tensor_info); + _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); + _tmp_y.allocator()->allocate(); + } + + _border_handler.configure(input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); +} + +void NESobel5x5::run() +{ + _border_handler.run(_border_handler.window()); + NEScheduler::get().schedule(&_sobel_hor, Window::DimY); + NEScheduler::get().schedule(&_sobel_vert, Window::DimY); +} diff --git a/src/runtime/NEON/functions/NESobel7x7.cpp b/src/runtime/NEON/functions/NESobel7x7.cpp new file mode 100644 index 0000000000..f628da9709 --- /dev/null +++ b/src/runtime/NEON/functions/NESobel7x7.cpp @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
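// Usage sketch (illustrative only): computing both 5x5 Sobel gradients of a U8 image with
// the NESobel5x5 function shown above. The image size and border mode are assumptions; as
// in the implementation, the horizontal pass writes S16 temporaries and the vertical pass
// produces the final S16 gradient images.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NESobel5x5.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void example_sobel5x5()
{
    Tensor src, gx, gy;
    src.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));
    gx.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::S16));
    gy.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::S16));

    NESobel5x5 sobel;
    sobel.configure(&src, &gx, &gy, BorderMode::UNDEFINED, 0);

    src.allocator()->allocate();
    gx.allocator()->allocate();
    gy.allocator()->allocate();

    // ... fill src ...
    sobel.run();
}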
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NESobel7x7.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/TensorAllocator.h" + +using namespace arm_compute; + +NESobel7x7::NESobel7x7() + : _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler() +{ +} + +void NESobel7x7::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + + const bool run_sobel_x = output_x != nullptr; + const bool run_sobel_y = output_y != nullptr; + + TensorInfo tensor_info(input->info()->tensor_shape(), Format::S32); + + if(run_sobel_x && run_sobel_y) + { + _tmp_x.allocator()->init(tensor_info); + _tmp_y.allocator()->init(tensor_info); + _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); + _tmp_x.allocator()->allocate(); + _tmp_y.allocator()->allocate(); + } + else if(run_sobel_x) + { + _tmp_x.allocator()->init(tensor_info); + _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); + _tmp_x.allocator()->allocate(); + } + else if(run_sobel_y) + { + _tmp_y.allocator()->init(tensor_info); + _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); + _tmp_y.allocator()->allocate(); + } + + _border_handler.configure(input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); +} + +void NESobel7x7::run() +{ + _border_handler.run(_border_handler.window()); + NEScheduler::get().schedule(&_sobel_hor, Window::DimY); + NEScheduler::get().schedule(&_sobel_vert, Window::DimY); +} diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp new file mode 100644 index 0000000000..0651eab1bc --- /dev/null +++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include + +using namespace arm_compute; + +NESoftmaxLayer::NESoftmaxLayer() + : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _fill_border_kernel(), _max(), _sum(), _tmp() +{ +} + +void NESoftmaxLayer::configure(ITensor *input, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32); + + // Create intermediate tensors shapes + TensorInfo tensor_info_tmp(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()); + _tmp.allocator()->init(tensor_info_tmp); + + TensorShape shape = input->info()->tensor_shape(); + shape.set(0, 1); + TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()); + _max.allocator()->init(tensor_info_max_sum); + _sum.allocator()->init(tensor_info_max_sum); + + // Configure Kernels + _max_kernel.configure(input, &_max); + _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum); + _norm_kernel.configure(&_tmp, &_sum, output); + _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::CONSTANT, PixelValue(-FLT_MAX)); + + // Allocate intermediate tensors + _tmp.allocator()->allocate(); + _max.allocator()->allocate(); + _sum.allocator()->allocate(); +} + +void NESoftmaxLayer::run() +{ + NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY); + NEScheduler::get().schedule(&_max_kernel, Window::DimY); + NEScheduler::get().schedule(&_shift_exp_sum_kernel, Window::DimY); + NEScheduler::get().schedule(&_norm_kernel, Window::DimY); +} diff --git a/src/runtime/NEON/functions/NETableLookup.cpp b/src/runtime/NEON/functions/NETableLookup.cpp new file mode 100644 index 0000000000..ebb8a0ac9b --- /dev/null +++ b/src/runtime/NEON/functions/NETableLookup.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
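// Usage sketch (illustrative only): running the NESoftmaxLayer implemented above on a
// batch of F32 vectors. The shape is an assumption; internally the function chains the
// max, shift-exp-sum and normalization kernels, so the caller only configures and runs.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void example_softmax_layer()
{
    Tensor logits, probs;
    // 1000 classes, batch of 4
    const TensorInfo info(TensorShape(1000U, 4U), 1, DataType::F32, 0);
    logits.allocator()->init(info);
    probs.allocator()->init(info);

    NESoftmaxLayer softmax;
    softmax.configure(&logits, &probs);

    logits.allocator()->allocate();
    probs.allocator()->allocate();

    // ... fill logits ...
    softmax.run();
}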
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NETableLookup.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NETableLookupKernel.h" + +#include + +using namespace arm_compute; + +void NETableLookup::configure(const ITensor *input, const ILut *lut, ITensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, lut, output); + _kernel = std::move(k); +} diff --git a/src/runtime/NEON/functions/NEThreshold.cpp b/src/runtime/NEON/functions/NEThreshold.cpp new file mode 100644 index 0000000000..93dc124880 --- /dev/null +++ b/src/runtime/NEON/functions/NEThreshold.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEThreshold.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEThresholdKernel.h" + +#include + +using namespace arm_compute; + +void NEThreshold::configure(const ITensor *input, ITensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output, threshold, false_value, true_value, type, upper); + _kernel = std::move(k); +} diff --git a/src/runtime/NEON/functions/NETranspose.cpp b/src/runtime/NEON/functions/NETranspose.cpp new file mode 100644 index 0000000000..53ac9c5ee3 --- /dev/null +++ b/src/runtime/NEON/functions/NETranspose.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NETranspose.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NETransposeKernel.h" + +#include + +using namespace arm_compute; + +void NETranspose::configure(const ITensor *input, ITensor *output) +{ + auto k = arm_compute::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); +} diff --git a/src/runtime/NEON/functions/NEWarpAffine.cpp b/src/runtime/NEON/functions/NEWarpAffine.cpp new file mode 100644 index 0000000000..24fb16f9e3 --- /dev/null +++ b/src/runtime/NEON/functions/NEWarpAffine.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEWarpAffine.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEWarpKernel.h" + +#include + +using namespace arm_compute; + +void NEWarpAffine::configure(ITensor *input, ITensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(nullptr == matrix); + + switch(policy) + { + case InterpolationPolicy::NEAREST_NEIGHBOR: + { + auto k = arm_compute::cpp14::make_unique>(); + k->configure(input, output, matrix, border_mode, constant_border_value); + _kernel = std::move(k); + break; + } + case InterpolationPolicy::BILINEAR: + { + auto k = arm_compute::cpp14::make_unique>(); + k->configure(input, output, matrix, border_mode, constant_border_value); + _kernel = std::move(k); + break; + } + case InterpolationPolicy::AREA: + default: + ARM_COMPUTE_ERROR("Interpolation type not supported"); + } + + _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value); +} diff --git a/src/runtime/NEON/functions/NEWarpPerspective.cpp b/src/runtime/NEON/functions/NEWarpPerspective.cpp new file mode 100644 index 0000000000..84b2df5bfa --- /dev/null +++ b/src/runtime/NEON/functions/NEWarpPerspective.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEWarpPerspective.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/kernels/NEWarpKernel.h" + +#include + +using namespace arm_compute; + +void NEWarpPerspective::configure(ITensor *input, ITensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(nullptr == matrix); + + switch(policy) + { + case InterpolationPolicy::NEAREST_NEIGHBOR: + { + auto k = arm_compute::cpp14::make_unique>(); + k->configure(input, output, matrix, border_mode, constant_border_value); + _kernel = std::move(k); + break; + } + case InterpolationPolicy::BILINEAR: + { + auto k = arm_compute::cpp14::make_unique>(); + k->configure(input, output, matrix, border_mode, constant_border_value); + _kernel = std::move(k); + break; + } + case InterpolationPolicy::AREA: + default: + ARM_COMPUTE_ERROR("Interpolation type not supported"); + } + + _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value); +} diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp new file mode 100644 index 0000000000..0cced73276 --- /dev/null +++ b/src/runtime/OMP/OMPScheduler.cpp @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/OMP/OMPScheduler.h" + +#include "arm_compute/core/CPP/ICPPKernel.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" + +#include + +using namespace arm_compute; + +OMPScheduler &OMPScheduler::get() +{ + static OMPScheduler scheduler; + return scheduler; +} + +OMPScheduler::OMPScheduler() + : _num_threads(omp_get_max_threads()) +{ +} + +unsigned int OMPScheduler::num_threads() const +{ + return _num_threads; +} + +void OMPScheduler::set_num_threads(unsigned int num_threads) +{ + const unsigned int num_cores = omp_get_max_threads(); + _num_threads = num_threads == 0 ? 
num_cores : num_threads; +} + +void OMPScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension) +{ + ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel"); + + const Window &max_window = kernel->window(); + const unsigned int num_iterations = max_window.num_iterations(split_dimension); + const unsigned int num_threads = std::min(num_iterations, _num_threads); + + if(!kernel->is_parallelisable() || 1 == num_threads) + { + kernel->run(max_window); + } + else + { + #pragma omp parallel num_threads(num_threads) + { + #pragma omp for + for(unsigned int t = 0; t < num_threads; ++t) + { + Window win = max_window.split_window(split_dimension, t, num_threads); + win.set_thread_id(t); + win.set_num_threads(num_threads); + kernel->run(win); + } + } + } +} diff --git a/src/runtime/Pyramid.cpp b/src/runtime/Pyramid.cpp new file mode 100644 index 0000000000..f1b6c93b50 --- /dev/null +++ b/src/runtime/Pyramid.cpp @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
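// Illustrative sketch (not ACL code): the work-splitting pattern used by
// OMPScheduler::schedule() above, reduced to a standalone OpenMP program. The iteration
// count and the slice arithmetic are assumptions made for the example; in the scheduler
// the actual sub-ranges come from Window::split_window(). Compile with -fopenmp.
#include <algorithm>
#include <cstdio>
#include <omp.h>

int main()
{
    const unsigned int num_iterations = 37;
    // Clamp the team size to the amount of available work, as the scheduler does
    const unsigned int num_threads = std::min(num_iterations, static_cast<unsigned int>(omp_get_max_threads()));

    #pragma omp parallel num_threads(num_threads)
    {
        #pragma omp for
        for(unsigned int t = 0; t < num_threads; ++t)
        {
            // Split [0, num_iterations) into num_threads roughly equal slices
            const unsigned int start = (num_iterations * t) / num_threads;
            const unsigned int end   = (num_iterations * (t + 1)) / num_threads;
            std::printf("thread %u processes iterations [%u, %u)\n", t, start, end);
        }
    }
    return 0;
}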
+ */ +#include "arm_compute/runtime/Pyramid.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/PyramidInfo.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" + +#include + +using namespace arm_compute; + +void Pyramid::init(const PyramidInfo &info) +{ + internal_init(info, false); +} + +void Pyramid::init_auto_padding(const PyramidInfo &info) +{ + internal_init(info, true); +} + +void Pyramid::internal_init(const PyramidInfo &info, bool auto_padding) +{ + _info = info; + _pyramid = arm_compute::cpp14::make_unique(_info.num_levels()); + + size_t w = _info.width(); + size_t h = _info.height(); + size_t ref_w = w; + size_t ref_h = h; + bool is_orb_scale = (SCALE_PYRAMID_ORB == _info.scale()); + TensorShape tensor_shape = _info.tensor_shape(); + + // Note: Look-up table used by the OpenVX sample implementation + const float c_orbscale[4] = { 0.5f, + SCALE_PYRAMID_ORB, + SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB, + SCALE_PYRAMID_ORB *SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB + }; + + for(size_t i = 0; i < _info.num_levels(); ++i) + { + TensorInfo tensor_info(tensor_shape, _info.format()); + + if(auto_padding) + { + tensor_info.auto_padding(); + } + + (_pyramid.get() + i)->allocator()->init(tensor_info); + + if(is_orb_scale) + { + float orb_scale = c_orbscale[(i + 1) % 4]; + w = static_cast(std::ceil(static_cast(ref_w) * orb_scale)); + h = static_cast(std::ceil(static_cast(ref_h) * orb_scale)); + + if(0 == ((i + 1) % 4)) + { + ref_w = w; + ref_h = h; + } + } + else + { + w = (w + 1) * _info.scale(); + h = (h + 1) * _info.scale(); + } + + // Update tensor_shape + tensor_shape.set(0, w); + tensor_shape.set(1, h); + } +} + +void Pyramid::allocate() +{ + ARM_COMPUTE_ERROR_ON(_pyramid == nullptr); + + for(size_t i = 0; i < _info.num_levels(); ++i) + { + (_pyramid.get() + i)->allocator()->allocate(); + } +} + +const PyramidInfo *Pyramid::info() const +{ + return &_info; +} + +Tensor *Pyramid::get_pyramid_level(size_t index) const +{ + ARM_COMPUTE_ERROR_ON(index >= _info.num_levels()); + + return (_pyramid.get() + index); +} diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp new file mode 100644 index 0000000000..a131928293 --- /dev/null +++ b/src/runtime/Scheduler.cpp @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/Scheduler.h" + +#include "arm_compute/core/Error.h" +#if ARM_COMPUTE_CPP_SCHEDULER +#include "arm_compute/runtime/CPP/CPPScheduler.h" +#endif + +#include "arm_compute/runtime/SingleThreadScheduler.h" + +#if ARM_COMPUTE_OPENMP_SCHEDULER +#include "arm_compute/runtime/OMP/OMPScheduler.h" +#endif + +using namespace arm_compute; + +#if !ARM_COMPUTE_CPP_SCHEDULER && ARM_COMPUTE_OPENMP_SCHEDULER +Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::OMP; +#elif ARM_COMPUTE_CPP_SCHEDULER && !ARM_COMPUTE_OPENMP_SCHEDULER +Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP; +#elif ARM_COMPUTE_CPP_SCHEDULER && ARM_COMPUTE_OPENMP_SCHEDULER +Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP; +#else +Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::ST; +#endif + +void Scheduler::set(Type t) +{ + ARM_COMPUTE_ERROR_ON(!Scheduler::is_available(t)); + _scheduler_type = t; +} + +bool Scheduler::is_available(Type t) +{ + switch(t) + { + case Type::ST: + { + return true; + } + case Type::CPP: + { +#if ARM_COMPUTE_CPP_SCHEDULER + return true; +#else + return false; +#endif + } + case Type::OMP: + { +#if ARM_COMPUTE_OPENMP_SCHEDULER + return true; +#else + return false; +#endif + } + case Type::CUSTOM: + { + return _custom_scheduler != nullptr; + } + default: + { + ARM_COMPUTE_ERROR("Invalid Scheduler type"); + return false; + } + } +} + +Scheduler::Type Scheduler::get_type() +{ + return _scheduler_type; +} + +IScheduler &Scheduler::get() +{ + switch(_scheduler_type) + { + case Type::ST: + { + return SingleThreadScheduler::get(); + } + case Type::CPP: + { +#if ARM_COMPUTE_CPP_SCHEDULER + return CPPScheduler::get(); +#else + ARM_COMPUTE_ERROR("Recompile with cppthreads=1 to use C++11 scheduler."); +#endif + break; + } + case Type::OMP: + { +#if ARM_COMPUTE_OPENMP_SCHEDULER + return OMPScheduler::get(); +#else + ARM_COMPUTE_ERROR("Recompile with openmp=1 to use openmp scheduler."); +#endif + break; + } + case Type::CUSTOM: + { + if(_custom_scheduler == nullptr) + { + ARM_COMPUTE_ERROR("No custom scheduler has been setup. Call set(std::shared_ptr &scheduler) before Scheduler::get()"); + } + else + { + return *_custom_scheduler; + } + break; + } + default: + { + ARM_COMPUTE_ERROR("Invalid Scheduler type"); + break; + } + } + return SingleThreadScheduler::get(); +} + +std::shared_ptr Scheduler::_custom_scheduler = nullptr; + +void Scheduler::set(std::shared_ptr &scheduler) +{ + _custom_scheduler = scheduler; + set(Type::CUSTOM); +} diff --git a/src/runtime/SubTensor.cpp b/src/runtime/SubTensor.cpp new file mode 100644 index 0000000000..32924be3dc --- /dev/null +++ b/src/runtime/SubTensor.cpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/SubTensor.h" + +#include "arm_compute/core/Error.h" + +using namespace arm_compute; + +SubTensor::SubTensor(ITensor *parent, const TensorShape &tensor_shape, const Coordinates &coords) + : _parent(nullptr), _info() +{ + ARM_COMPUTE_ERROR_ON(parent == nullptr); + _info = SubTensorInfo(parent->info(), tensor_shape, coords); + _parent = parent; +} + +ITensorInfo *SubTensor::info() const +{ + return &_info; +} + +ITensorInfo *SubTensor::info() +{ + return &_info; +} + +uint8_t *SubTensor::buffer() const +{ + ARM_COMPUTE_ERROR_ON(_parent == nullptr); + return _parent->buffer(); +} + +ITensor *SubTensor::parent() +{ + return _parent; +} diff --git a/src/runtime/Tensor.cpp b/src/runtime/Tensor.cpp new file mode 100644 index 0000000000..435068c61d --- /dev/null +++ b/src/runtime/Tensor.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/Tensor.h" + +using namespace arm_compute; + +Tensor::Tensor() + : _allocator() +{ +} + +ITensorInfo *Tensor::info() const +{ + return &_allocator.info(); +} + +ITensorInfo *Tensor::info() +{ + return &_allocator.info(); +} + +uint8_t *Tensor::buffer() const +{ + return _allocator.data(); +} + +TensorAllocator *Tensor::allocator() +{ + return &_allocator; +} diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp new file mode 100644 index 0000000000..5c719c761a --- /dev/null +++ b/src/runtime/TensorAllocator.cpp @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
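// Usage sketch (illustrative only): viewing a region of a parent Tensor through the
// SubTensor class defined above. The shapes and coordinates are assumptions; as the
// implementation shows, the sub-tensor shares the parent's buffer and only carries its
// own SubTensorInfo.
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/SubTensor.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void example_subtensor()
{
    Tensor parent;
    parent.allocator()->init(TensorInfo(TensorShape(64U, 64U), Format::U8));
    parent.allocator()->allocate();

    // 16x16 window whose origin sits at (8, 8) inside the parent
    SubTensor view(&parent, TensorShape(16U, 16U), Coordinates(8, 8));

    // No separate allocation: the view reads and writes the parent's memory
    uint8_t *same_buffer = view.buffer();
    (void)same_buffer;
}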
diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp
new file mode 100644
index 0000000000..5c719c761a
--- /dev/null
+++ b/src/runtime/TensorAllocator.cpp
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+
+#include <cstddef>
+
+using namespace arm_compute;
+
+namespace
+{
+bool validate_subtensor_shape(const TensorInfo &parent_info, const TensorInfo &child_info, const Coordinates &coords)
+{
+    bool               is_valid     = true;
+    const TensorShape &parent_shape = parent_info.tensor_shape();
+    const TensorShape &child_shape  = child_info.tensor_shape();
+    const size_t       parent_dims  = parent_info.num_dimensions();
+    const size_t       child_dims   = child_info.num_dimensions();
+
+    if(child_dims <= parent_dims)
+    {
+        for(size_t num_dimensions = child_dims; num_dimensions > 0; --num_dimensions)
+        {
+            const size_t child_dim_size = coords[num_dimensions - 1] + child_shape[num_dimensions - 1];
+
+            if((coords[num_dimensions - 1] < 0) || (child_dim_size > parent_shape[num_dimensions - 1]))
+            {
+                is_valid = false;
+                break;
+            }
+        }
+    }
+    else
+    {
+        is_valid = false;
+    }
+
+    return is_valid;
+}
+} // namespace
+
+TensorAllocator::TensorAllocator()
+    : _buffer(nullptr)
+{
+}
+
+void TensorAllocator::init(const TensorAllocator &allocator, const Coordinates &coords, TensorInfo sub_info)
+{
+    // Get parent info
+    const TensorInfo parent_info = allocator.info();
+
+    // Check if coordinates and new shape are within the parent tensor
+    ARM_COMPUTE_ERROR_ON(!validate_subtensor_shape(parent_info, sub_info, coords));
+    ARM_COMPUTE_UNUSED(validate_subtensor_shape);
+
+    // Copy pointer to buffer
+    _buffer = allocator._buffer;
+
+    // Init tensor info with new dimensions
+    size_t total_size = parent_info.offset_element_in_bytes(coords) + sub_info.total_size() - sub_info.offset_first_element_in_bytes();
+    sub_info.init(sub_info.tensor_shape(), sub_info.format(), parent_info.strides_in_bytes(), parent_info.offset_element_in_bytes(coords), total_size);
+
+    // Set TensorInfo
+    init(sub_info);
+}
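To make the offset arithmetic in TensorAllocator::init() above concrete, here is a worked example written as comments. The 8x8 U8 parent and the (2, 2) anchor are assumed values, not part of the patch.

    // Parent: 8x8 U8 tensor with no padding -> strides are 1 byte in x and 8 bytes in y.
    // Sub-tensor: 4x4 region anchored at coordinates (2, 2).
    //
    //   parent_info.offset_element_in_bytes({2, 2}) = 2 * 1 + 2 * 8 = 18 bytes
    //
    // The sub-tensor reuses the parent's strides and buffer pointer; only the anchor
    // offset (18 bytes here) and the reported total size change, so no data is copied.
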
+uint8_t *TensorAllocator::data() const
+{
+    return (_buffer != nullptr) ? _buffer.get()->data() : nullptr;
+}
+
+void TensorAllocator::allocate()
+{
+    ARM_COMPUTE_ERROR_ON(_buffer != nullptr);
+
+    _buffer = std::make_shared<std::vector<uint8_t>>(info().total_size());
+    info().set_is_resizable(false);
+}
+
+void TensorAllocator::free()
+{
+    ARM_COMPUTE_ERROR_ON(_buffer == nullptr);
+
+    _buffer.reset();
+    info().set_is_resizable(true);
+}
+
+uint8_t *TensorAllocator::lock()
+{
+    return (_buffer != nullptr) ? _buffer.get()->data() : nullptr;
+}
+
+void TensorAllocator::unlock()
+{
+}
diff --git a/src/runtime/Utils.cpp b/src/runtime/Utils.cpp
new file mode 100644
index 0000000000..1b06117c7b
--- /dev/null
+++ b/src/runtime/Utils.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/Utils.h"
+
+#include <map>
+#include <string>
+
+using namespace arm_compute;
+
+const std::string &arm_compute::string_from_scheduler_type(Scheduler::Type t)
+{
+    static std::map<Scheduler::Type, const std::string> scheduler_type_map =
+    {
+        { Scheduler::Type::ST, "Single Thread" },
+        { Scheduler::Type::CPP, "C++11 Threads" },
+        { Scheduler::Type::OMP, "OpenMP Threads" },
+        { Scheduler::Type::CUSTOM, "Custom" }
+    };
+
+    return scheduler_type_map[t];
+}
-- 
cgit v1.2.1
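Finally, a minimal sketch of the allocator lifecycle and the new string_from_scheduler_type() helper, shown outside the patch itself. The tensor shape and data type are illustrative assumptions.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Scheduler.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "arm_compute/runtime/Utils.h"

    #include <iostream>

    using namespace arm_compute;

    int main()
    {
        Tensor t;
        t.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
        t.allocator()->allocate(); // backs the tensor with a std::vector<uint8_t> and marks it non-resizable
        // ... configure and run functions on t ...
        t.allocator()->free();     // releases the buffer and makes the tensor resizable again

        // Print the human-readable name of the active scheduler backend.
        std::cout << string_from_scheduler_type(Scheduler::get_type()) << std::endl;
        return 0;
    }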