From bef7fa27b0d231a8649952f60808132d109b6345 Mon Sep 17 00:00:00 2001
From: Sang-Hoon Park
Date: Wed, 21 Oct 2020 15:58:54 +0100
Subject: COMPMID-3639: (3RDPARTY_UPDATE) Move CL kernels to src

Change-Id: I10d27db788e5086adae1841e3e2441cd9b76ef84
Signed-off-by: Sang-Hoon Park
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4310
Reviewed-by: Georgios Pinitas
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
---
 src/runtime/CL/functions/CLAbsoluteDifference.cpp  |   2 +-
 src/runtime/CL/functions/CLAccumulate.cpp          |   2 +-
 src/runtime/CL/functions/CLActivationLayer.cpp     |   2 +-
 src/runtime/CL/functions/CLArgMinMaxLayer.cpp      |  22 +++-
 .../CL/functions/CLBatchNormalizationLayer.cpp     |  14 +-
 src/runtime/CL/functions/CLBatchToSpaceLayer.cpp   |  16 ++-
 src/runtime/CL/functions/CLBitwiseAnd.cpp          |   2 +-
 src/runtime/CL/functions/CLBitwiseNot.cpp          |   2 +-
 src/runtime/CL/functions/CLBitwiseOr.cpp           |   2 +-
 src/runtime/CL/functions/CLBitwiseXor.cpp          |   2 +-
 .../CL/functions/CLBoundingBoxTransform.cpp        |   2 +-
 src/runtime/CL/functions/CLBox3x3.cpp              |   5 +-
 src/runtime/CL/functions/CLCannyEdge.cpp           |  30 +++--
 src/runtime/CL/functions/CLCast.cpp                |   2 +-
 src/runtime/CL/functions/CLChannelCombine.cpp      |   2 +-
 src/runtime/CL/functions/CLChannelExtract.cpp      |   2 +-
 src/runtime/CL/functions/CLChannelShuffleLayer.cpp |   2 +-
 src/runtime/CL/functions/CLColorConvert.cpp        |   2 +-
 src/runtime/CL/functions/CLComparison.cpp          |   7 +-
 src/runtime/CL/functions/CLComputeAllAnchors.cpp   |   1 +
 src/runtime/CL/functions/CLConcatenateLayer.cpp    |  12 +-
 .../functions/CLConvertFullyConnectedWeights.cpp   |   2 +
 src/runtime/CL/functions/CLConvolution.cpp         |  32 +++--
 src/runtime/CL/functions/CLConvolutionLayer.cpp    |   3 +-
 src/runtime/CL/functions/CLCopy.cpp                |   2 +-
 src/runtime/CL/functions/CLCropResize.cpp          |   6 +
 src/runtime/CL/functions/CLDeconvolutionLayer.cpp  |   1 -
 .../CL/functions/CLDeconvolutionLayerUpsample.cpp  |  17 ++-
 src/runtime/CL/functions/CLDepthConvertLayer.cpp   |   2 +-
 src/runtime/CL/functions/CLDepthToSpaceLayer.cpp   |   2 +-
 .../CL/functions/CLDepthwiseConvolutionLayer.cpp   |  52 ++++----
 src/runtime/CL/functions/CLDequantizationLayer.cpp |   2 +-
 src/runtime/CL/functions/CLDerivative.cpp          |   5 +-
 src/runtime/CL/functions/CLDilate.cpp              |   5 +-
 .../CL/functions/CLDirectConvolutionLayer.cpp      |  21 +--
 .../CL/functions/CLDirectDeconvolutionLayer.cpp    |   6 +
 .../CL/functions/CLElementWiseUnaryLayer.cpp       |   2 +-
 .../CL/functions/CLElementwiseOperations.cpp       |   2 +-
 src/runtime/CL/functions/CLEqualizeHistogram.cpp   |  24 +++-
 src/runtime/CL/functions/CLErode.cpp               |   5 +-
 src/runtime/CL/functions/CLFFT1D.cpp               |  30 +++--
 src/runtime/CL/functions/CLFFT2D.cpp               |   5 +
 src/runtime/CL/functions/CLFFTConvolutionLayer.cpp |   7 +
 src/runtime/CL/functions/CLFastCorners.cpp         |  20 +--
 src/runtime/CL/functions/CLFill.cpp                |   2 +-
 src/runtime/CL/functions/CLFillBorder.cpp          |   2 +-
 src/runtime/CL/functions/CLFlattenLayer.cpp        |   2 +-
 src/runtime/CL/functions/CLFloor.cpp               |   2 +-
 src/runtime/CL/functions/CLFullyConnectedLayer.cpp |  13 ++
 .../CL/functions/CLFuseBatchNormalization.cpp      |  10 +-
 src/runtime/CL/functions/CLGEMM.cpp                | 142 ++++++++++++++-------
 .../CL/functions/CLGEMMConvolutionLayer.cpp        |  47 +++++--
 .../CL/functions/CLGEMMDeconvolutionLayer.cpp      |  25 +++-
 .../CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp  |  80 +++++++-----
 src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp |   9 +-
 src/runtime/CL/functions/CLGather.cpp              |   2 +-
 src/runtime/CL/functions/CLGaussian3x3.cpp         |   5 +-
 src/runtime/CL/functions/CLGaussian5x5.cpp         |  24 ++--
 src/runtime/CL/functions/CLGaussianPyramid.cpp     |  47 ++++---
 .../CL/functions/CLGenerateProposalsLayer.cpp      |  57 +++++---
 src/runtime/CL/functions/CLHOGDescriptor.cpp       |  22 +++-
 src/runtime/CL/functions/CLHOGDetector.cpp         |  11 +-
 src/runtime/CL/functions/CLHOGGradient.cpp         |  15 ++-
 src/runtime/CL/functions/CLHOGMultiDetection.cpp   |  21 ++-
 src/runtime/CL/functions/CLHarrisCorners.cpp       |  26 ++--
 .../CL/functions/CLInstanceNormalizationLayer.cpp  |   4 +-
 src/runtime/CL/functions/CLIntegralImage.cpp       |  16 ++-
 src/runtime/CL/functions/CLL2NormalizeLayer.cpp    |  16 ++-
 src/runtime/CL/functions/CLLSTMLayer.cpp           |  57 ++++++---
 src/runtime/CL/functions/CLLSTMLayerQuantized.cpp  |   8 ++
 src/runtime/CL/functions/CLLaplacianPyramid.cpp    |   3 +
 .../CL/functions/CLLaplacianReconstruct.cpp        |   2 +
 .../CL/functions/CLLocallyConnectedLayer.cpp       |  35 +++--
 src/runtime/CL/functions/CLMagnitude.cpp           |   2 +-
 src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp   |  17 ++-
 src/runtime/CL/functions/CLMeanStdDev.cpp          |  18 ++-
 .../functions/CLMeanStdDevNormalizationLayer.cpp   |   2 +-
 src/runtime/CL/functions/CLMedian3x3.cpp           |   5 +-
 src/runtime/CL/functions/CLMinMaxLocation.cpp      |  17 ++-
 src/runtime/CL/functions/CLNonLinearFilter.cpp     |   5 +-
 .../CL/functions/CLNonMaximaSuppression3x3.cpp     |   7 +-
 src/runtime/CL/functions/CLNormalizationLayer.cpp  |  17 ++-
 .../CL/functions/CLNormalizePlanarYUVLayer.cpp     |   2 +-
 src/runtime/CL/functions/CLOpticalFlow.cpp         |  38 +++---
 src/runtime/CL/functions/CLPReluLayer.cpp          |   2 +-
 src/runtime/CL/functions/CLPadLayer.cpp            |  17 ++-
 src/runtime/CL/functions/CLPermute.cpp             |   2 +-
 src/runtime/CL/functions/CLPhase.cpp               |   2 +-
 .../CL/functions/CLPixelWiseMultiplication.cpp     |  15 ++-
 src/runtime/CL/functions/CLPoolingLayer.cpp        |   5 +-
 src/runtime/CL/functions/CLPriorBoxLayer.cpp       |   4 +-
 src/runtime/CL/functions/CLQLSTMLayer.cpp          |  91 ++++++++++---
 src/runtime/CL/functions/CLQuantizationLayer.cpp   |   2 +-
 src/runtime/CL/functions/CLRNNLayer.cpp            |  24 +++-
 src/runtime/CL/functions/CLROIAlignLayer.cpp       |   3 +-
 src/runtime/CL/functions/CLROIPoolingLayer.cpp     |   4 +-
 src/runtime/CL/functions/CLRange.cpp               |   2 +-
 src/runtime/CL/functions/CLReduceMean.cpp          |   3 +-
 src/runtime/CL/functions/CLReductionOperation.cpp  |  42 ++++--
 src/runtime/CL/functions/CLRemap.cpp               |   5 +-
 src/runtime/CL/functions/CLReorgLayer.cpp          |   2 +-
 src/runtime/CL/functions/CLReshapeLayer.cpp        |   2 +-
 src/runtime/CL/functions/CLReverse.cpp             |   2 +-
 src/runtime/CL/functions/CLScale.cpp               |   5 +-
 src/runtime/CL/functions/CLScharr3x3.cpp           |   5 +-
 src/runtime/CL/functions/CLSelect.cpp              |   2 +-
 src/runtime/CL/functions/CLSlice.cpp               |   2 +-
 src/runtime/CL/functions/CLSobel3x3.cpp            |   7 +-
 src/runtime/CL/functions/CLSobel5x5.cpp            |  33 +++--
 src/runtime/CL/functions/CLSobel7x7.cpp            |  33 +++--
 src/runtime/CL/functions/CLSoftmaxLayer.cpp        |  30 +++--
 src/runtime/CL/functions/CLSpaceToBatchLayer.cpp   |  21 ++-
 src/runtime/CL/functions/CLSpaceToDepthLayer.cpp   |  10 +-
 src/runtime/CL/functions/CLStackLayer.cpp          |  11 +-
 src/runtime/CL/functions/CLStridedSlice.cpp        |   2 +-
 src/runtime/CL/functions/CLTableLookup.cpp         |   2 +-
 src/runtime/CL/functions/CLThreshold.cpp           |   2 +-
 src/runtime/CL/functions/CLTile.cpp                |   2 +-
 src/runtime/CL/functions/CLTranspose.cpp           |   2 +-
 src/runtime/CL/functions/CLUpsampleLayer.cpp       |  10 +-
 src/runtime/CL/functions/CLWarpAffine.cpp          |   5 +-
 src/runtime/CL/functions/CLWarpPerspective.cpp     |   5 +-
 .../CL/functions/CLWinogradConvolutionLayer.cpp    |  23 +++-
 .../CL/functions/CLWinogradInputTransform.cpp      |   5 +-
 src/runtime/CL/functions/CLYOLOLayer.cpp           |   2 +-
 125 files changed, 1106 insertions(+), 563 deletions(-)

(limited to 'src/runtime/CL/functions')
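Before the file-by-file hunks, note the shape of the refactoring. Every change below is an instance of the same mechanical pattern: the kernel headers leave the public include tree (arm_compute/core/CL/kernels/) for src/core/CL/kernels/, so a runtime function can no longer hold a kernel by value in its public header. It instead forward-declares the kernel, owns it through a std::unique_ptr, builds it with support::cpp14::make_unique, and defines its destructor out of line in the .cpp, where the kernel type is complete. A minimal sketch of the pattern, using the hypothetical names CLExampleFunction and CLExampleKernel in place of the real classes listed above:

    // CLExampleFunction.h -- the public header (hypothetical names throughout).
    // Only a forward declaration of the kernel is visible from here.
    #include "arm_compute/runtime/IFunction.h"

    #include <memory>

    namespace arm_compute
    {
    class CLExampleKernel;

    class CLExampleFunction : public IFunction
    {
    public:
        CLExampleFunction();
        ~CLExampleFunction(); // declared here, defined where CLExampleKernel is complete
        void run() override;

    private:
        std::unique_ptr<CLExampleKernel> _kernel; // was: CLExampleKernel _kernel;
    };
    } // namespace arm_compute

    // CLExampleFunction.cpp -- the implementation.
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "src/core/CL/kernels/CLExampleKernel.h" // was: arm_compute/core/CL/kernels/...
    #include "support/MemorySupport.h"

    namespace arm_compute
    {
    CLExampleFunction::CLExampleFunction()
        : _kernel(support::cpp14::make_unique<CLExampleKernel>())
    {
    }

    // A defaulted destructor must live here: std::unique_ptr<CLExampleKernel>
    // cannot be destroyed against the forward declaration alone.
    CLExampleFunction::~CLExampleFunction() = default;

    void CLExampleFunction::run()
    {
        CLScheduler::get().enqueue(*_kernel); // kernel members are now dereferenced
    }
    } // namespace arm_compute

This is why nearly every file below gains an out-of-line "~CLFoo() = default;", a make_unique call in the constructor initializer list, and a dereference in its CLScheduler::get().enqueue(*...) calls.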
diff --git a/src/runtime/CL/functions/CLAbsoluteDifference.cpp b/src/runtime/CL/functions/CLAbsoluteDifference.cpp
index d5d1bbdd7a..b7f40a516c 100644
--- a/src/runtime/CL/functions/CLAbsoluteDifference.cpp
+++ b/src/runtime/CL/functions/CLAbsoluteDifference.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/CL/functions/CLAbsoluteDifference.h"
 
-#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
+#include "src/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
 #include "support/MemorySupport.h"
 
 #include
diff --git a/src/runtime/CL/functions/CLAccumulate.cpp b/src/runtime/CL/functions/CLAccumulate.cpp
index 2f06252446..742de64e34 100644
--- a/src/runtime/CL/functions/CLAccumulate.cpp
+++ b/src/runtime/CL/functions/CLAccumulate.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/CL/functions/CLAccumulate.h"
 
-#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h"
+#include "src/core/CL/kernels/CLAccumulateKernel.h"
 #include "support/MemorySupport.h"
 
 #include
diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
index 5ddf227382..61c82b33eb 100644
--- a/src/runtime/CL/functions/CLActivationLayer.cpp
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp
@@ -24,9 +24,9 @@
 #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLRuntimeContext.h"
+#include "src/core/CL/kernels/CLActivationLayerKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
index 57c4f685f6..5fc849e3c5 100644
--- a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
+++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
@@ -30,8 +30,10 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "src/core/CL/CLValidate.h"
+#include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/runtime/Utils.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
@@ -40,6 +42,8 @@ CLArgMinMaxLayer::CLArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manage
 {
 }
 
+CLArgMinMaxLayer::~CLArgMinMaxLayer() = default;
+
 Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -124,13 +128,19 @@ void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, const
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
 
     // Configure reduction operation kernels
-    _reduction_kernels_vector.resize(_num_of_stages);
+    _reduction_kernels_vector.reserve(_num_of_stages);
+
+    auto add_reduction_kernel = [this, &compile_context, axis, op](const ICLTensor * input, const ICLTensor * prev_output, ICLTensor * output)
+    {
+        _reduction_kernels_vector.emplace_back(support::cpp14::make_unique<CLArgMinMaxLayerKernel>());
+        _reduction_kernels_vector.back()->configure(compile_context, input, prev_output, output, axis, op);
+    };
 
     _memory_group.manage(&_not_reshaped_output);
 
     // Create temporary tensors
     if(_num_of_stages == 1)
     {
-        _reduction_kernels_vector[0].configure(compile_context, input, nullptr, &_not_reshaped_output, axis, op);
+        add_reduction_kernel(input, nullptr, &_not_reshaped_output);
     }
     else
     {
@@ -144,19 +154,19 @@ void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, const
 
         // Apply ReductionOperation only on first kernel
         _memory_group.manage(&_results_vector[0]);
-        _reduction_kernels_vector[0].configure(compile_context, input, nullptr, &_results_vector[0], axis, op);
+        add_reduction_kernel(input, nullptr, &_results_vector[0]);
 
         // Apply ReductionOperation on intermediate stages
         for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
         {
             _memory_group.manage(&_results_vector[i]);
-            _reduction_kernels_vector[i].configure(compile_context, input, &_results_vector[i - 1], &_results_vector[i], axis, op);
+            add_reduction_kernel(input, &_results_vector[i - 1], &_results_vector[i]);
             _results_vector[i - 1].allocator()->allocate();
         }
 
         // Apply ReductionOperation on the last stage
         const unsigned int last_stage = _num_of_stages - 1;
-        _reduction_kernels_vector[last_stage].configure(compile_context, input, &_results_vector[last_stage - 1], &_not_reshaped_output, axis, op);
+        add_reduction_kernel(input, &_results_vector[last_stage - 1], &_not_reshaped_output);
         _results_vector[last_stage - 1].allocator()->allocate();
     }
     _reshape.configure(compile_context, &_not_reshaped_output, output);
@@ -169,7 +179,7 @@ void CLArgMinMaxLayer::run()
 
     for(unsigned int i = 0; i < _num_of_stages; ++i)
     {
-        CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
+        CLScheduler::get().enqueue(*_reduction_kernels_vector[i], false);
     }
     _reshape.run();
 }
diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
index 701add074e..77eed1140f 100644
--- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
@@ -29,14 +29,19 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/MemorySupport.h"
 
-using namespace arm_compute;
+#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
 
+namespace arm_compute
+{
 CLBatchNormalizationLayer::CLBatchNormalizationLayer()
-    : _norm_kernel()
+    : _norm_kernel(support::cpp14::make_unique<CLBatchNormalizationLayerKernel>())
 {
 }
 
+CLBatchNormalizationLayer::~CLBatchNormalizationLayer() = default;
+
 void CLBatchNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon,
                                           ActivationLayerInfo act_info)
 {
@@ -47,7 +52,7 @@ void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_contex
                                           const ICLTensor *gamma, float epsilon,
                                           ActivationLayerInfo act_info)
 {
-    _norm_kernel.configure(compile_context, input, output, mean, var, beta, gamma, epsilon, act_info);
+    _norm_kernel->configure(compile_context, input, output, mean, var, beta, gamma, epsilon, act_info);
 }
 
 Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
@@ -60,5 +65,6 @@ Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, const ITens
 
 void CLBatchNormalizationLayer::run()
 {
-    CLScheduler::get().enqueue(_norm_kernel, true);
+    CLScheduler::get().enqueue(*_norm_kernel, true);
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
index 5ba3b5bc9c..e0a2c430ed 100644
--- a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
@@ -30,13 +30,18 @@
 #include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h" -using namespace arm_compute; +#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h" +#include "support/MemorySupport.h" +namespace arm_compute +{ CLBatchToSpaceLayer::CLBatchToSpaceLayer() - : _batch_to_space_kernel() + : _batch_to_space_kernel(support::cpp14::make_unique()) { } +CLBatchToSpaceLayer::~CLBatchToSpaceLayer() = default; + void CLBatchToSpaceLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output); @@ -44,7 +49,7 @@ void CLBatchToSpaceLayer::configure(const ICLTensor *input, const ICLTensor *blo void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output) { - _batch_to_space_kernel.configure(compile_context, input, block_shape, output); + _batch_to_space_kernel->configure(compile_context, input, block_shape, output); } void CLBatchToSpaceLayer::configure(const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output) @@ -54,7 +59,7 @@ void CLBatchToSpaceLayer::configure(const ICLTensor *input, int32_t block_shape_ void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output) { - _batch_to_space_kernel.configure(compile_context, input, block_shape_x, block_shape_y, output); + _batch_to_space_kernel->configure(compile_context, input, block_shape_x, block_shape_y, output); } Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) @@ -69,5 +74,6 @@ Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_sha void CLBatchToSpaceLayer::run() { - CLScheduler::get().enqueue(_batch_to_space_kernel, true); + CLScheduler::get().enqueue(*_batch_to_space_kernel, true); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp index cb49e61e84..cfcd63f170 100644 --- a/src/runtime/CL/functions/CLBitwiseAnd.cpp +++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h" -#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h" +#include "src/core/CL/kernels/CLBitwiseAndKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp index 22c575ca8d..588c793f6a 100644 --- a/src/runtime/CL/functions/CLBitwiseNot.cpp +++ b/src/runtime/CL/functions/CLBitwiseNot.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseNot.h" -#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h" +#include "src/core/CL/kernels/CLBitwiseNotKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp index 4bbb8909fe..3a5de193a3 100644 --- a/src/runtime/CL/functions/CLBitwiseOr.cpp +++ b/src/runtime/CL/functions/CLBitwiseOr.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseOr.h" -#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h" +#include "src/core/CL/kernels/CLBitwiseOrKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp index bc37f6eaab..62aeaaa31f 100644 --- 
a/src/runtime/CL/functions/CLBitwiseXor.cpp +++ b/src/runtime/CL/functions/CLBitwiseXor.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseXor.h" -#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h" +#include "src/core/CL/kernels/CLBitwiseXorKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp index 2384fc4132..600d36290c 100644 --- a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp +++ b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h" -#include "arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h" +#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLBox3x3.cpp b/src/runtime/CL/functions/CLBox3x3.cpp index 0300899b59..be40f25055 100644 --- a/src/runtime/CL/functions/CLBox3x3.cpp +++ b/src/runtime/CL/functions/CLBox3x3.cpp @@ -23,8 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLBox3x3.h" -#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLBox3x3Kernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "support/MemorySupport.h" #include @@ -41,5 +42,5 @@ void CLBox3x3::configure(const CLCompileContext &compile_context, ICLTensor *inp auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp index cd2d6b478a..5a32564d2d 100644 --- a/src/runtime/CL/functions/CLCannyEdge.cpp +++ b/src/runtime/CL/functions/CLCannyEdge.cpp @@ -31,6 +31,10 @@ #include "arm_compute/runtime/CL/functions/CLSobel3x3.h" #include "arm_compute/runtime/CL/functions/CLSobel5x5.h" #include "arm_compute/runtime/CL/functions/CLSobel7x7.h" +#include "src/core/CL/kernels/CLCannyEdgeKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLSobel5x5Kernel.h" +#include "src/core/CL/kernels/CLSobel7x7Kernel.h" #include "support/MemorySupport.h" using namespace arm_compute; @@ -38,10 +42,10 @@ using namespace arm_compute; CLCannyEdge::CLCannyEdge(std::shared_ptr memory_manager) // NOLINT : _memory_group(std::move(memory_manager)), _sobel(), - _gradient(), - _border_mag_gradient(), - _non_max_suppr(), - _edge_trace(), + _gradient(support::cpp14::make_unique()), + _border_mag_gradient(support::cpp14::make_unique()), + _non_max_suppr(support::cpp14::make_unique()), + _edge_trace(support::cpp14::make_unique()), _gx(), _gy(), _mag(), @@ -55,6 +59,8 @@ CLCannyEdge::CLCannyEdge(std::shared_ptr memory_manager) // NOLI { } +CLCannyEdge::~CLCannyEdge() = default; + void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value) { @@ -143,7 +149,7 @@ void CLCannyEdge::configure(const CLCompileContext &compile_context, ICLTensor * _memory_group.manage(&_phase); // Configure gradient - 
_gradient.configure(compile_context, &_gx, &_gy, &_mag, &_phase, norm_type); + _gradient->configure(compile_context, &_gx, &_gy, &_mag, &_phase, norm_type); // Allocate intermediate buffers _gx.allocator()->allocate(); @@ -153,14 +159,14 @@ void CLCannyEdge::configure(const CLCompileContext &compile_context, ICLTensor * _memory_group.manage(&_nonmax); // Configure non-maxima suppression - _non_max_suppr.configure(compile_context, &_mag, &_phase, &_nonmax, lower_thr, border_mode == BorderMode::UNDEFINED); + _non_max_suppr->configure(compile_context, &_mag, &_phase, &_nonmax, lower_thr, border_mode == BorderMode::UNDEFINED); // Allocate intermediate buffers _phase.allocator()->allocate(); // Fill border around magnitude image as non-maxima suppression will access // it. If border mode is undefined filling the border is a nop. - _border_mag_gradient.configure(compile_context, &_mag, _non_max_suppr.border_size(), border_mode, constant_border_value); + _border_mag_gradient->configure(compile_context, &_mag, _non_max_suppr->border_size(), border_mode, constant_border_value); // Allocate intermediate buffers _mag.allocator()->allocate(); @@ -172,7 +178,7 @@ void CLCannyEdge::configure(const CLCompileContext &compile_context, ICLTensor * _memory_group.manage(&_l1_list_counter); // Configure edge tracing - _edge_trace.configure(compile_context, &_nonmax, output, upper_thr, lower_thr, &_visited, &_recorded, &_l1_stack, &_l1_list_counter); + _edge_trace->configure(compile_context, &_nonmax, output, upper_thr, lower_thr, &_visited, &_recorded, &_l1_stack, &_l1_list_counter); // Allocate intermediate buffers _visited.allocator()->allocate(); @@ -190,14 +196,14 @@ void CLCannyEdge::run() _sobel->run(); // Run phase and magnitude calculation - CLScheduler::get().enqueue(_gradient, false); + CLScheduler::get().enqueue(*_gradient, false); // Fill border before non-maxima suppression. Nop for border mode undefined. 
- CLScheduler::get().enqueue(_border_mag_gradient, false); + CLScheduler::get().enqueue(*_border_mag_gradient, false); // Run non max suppresion _nonmax.clear(CLScheduler::get().queue()); - CLScheduler::get().enqueue(_non_max_suppr, false); + CLScheduler::get().enqueue(*_non_max_suppr, false); // Clear temporary structures and run edge trace _output->clear(CLScheduler::get().queue()); @@ -205,5 +211,5 @@ void CLCannyEdge::run() _recorded.clear(CLScheduler::get().queue()); _l1_list_counter.clear(CLScheduler::get().queue()); _l1_stack.clear(CLScheduler::get().queue()); - CLScheduler::get().enqueue(_edge_trace, true); + CLScheduler::get().enqueue(*_edge_trace, true); } diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp index 95cc0e9239..2a28e06845 100644 --- a/src/runtime/CL/functions/CLCast.cpp +++ b/src/runtime/CL/functions/CLCast.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLCast.h" -#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLChannelCombine.cpp b/src/runtime/CL/functions/CLChannelCombine.cpp index 326caa8c74..e93aea31f4 100644 --- a/src/runtime/CL/functions/CLChannelCombine.cpp +++ b/src/runtime/CL/functions/CLChannelCombine.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLChannelCombine.h" -#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h" +#include "src/core/CL/kernels/CLChannelCombineKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLChannelExtract.cpp b/src/runtime/CL/functions/CLChannelExtract.cpp index aa37af9988..8b4a3f7458 100644 --- a/src/runtime/CL/functions/CLChannelExtract.cpp +++ b/src/runtime/CL/functions/CLChannelExtract.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLChannelExtract.h" -#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h" +#include "src/core/CL/kernels/CLChannelExtractKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp index b79afdb3b4..c443df3b37 100644 --- a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp +++ b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h" -#include "arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLColorConvert.cpp b/src/runtime/CL/functions/CLColorConvert.cpp index 2bbb30e24c..95f4257929 100644 --- a/src/runtime/CL/functions/CLColorConvert.cpp +++ b/src/runtime/CL/functions/CLColorConvert.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLColorConvert.h" -#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h" +#include "src/core/CL/kernels/CLColorConvertKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLComparison.cpp b/src/runtime/CL/functions/CLComparison.cpp index 8c18b35583..9b5840aa95 100644 --- a/src/runtime/CL/functions/CLComparison.cpp +++ b/src/runtime/CL/functions/CLComparison.cpp @@ -24,8 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLComparison.h" #include "arm_compute/core/CL/ICLTensor.h" -#include 
"arm_compute/core/CL/kernels/CLComparisonKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLComparisonKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "support/MemorySupport.h" namespace arm_compute @@ -47,7 +48,7 @@ void CLComparison::configure(const CLCompileContext &compile_context, ICLTensor if(broadcasted_info->info()->dimension(0) == 1) { - _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); } } } @@ -76,7 +77,7 @@ void CLComparisonStatic::configure(const CLCompileContext &compile_context, if(broadcasted_info->info()->dimension(0) == 1) { - _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); } } } diff --git a/src/runtime/CL/functions/CLComputeAllAnchors.cpp b/src/runtime/CL/functions/CLComputeAllAnchors.cpp index be86fc4f78..2cae0ee455 100644 --- a/src/runtime/CL/functions/CLComputeAllAnchors.cpp +++ b/src/runtime/CL/functions/CLComputeAllAnchors.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/runtime/CL/functions/CLComputeAllAnchors.h" +#include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h" #include "support/MemorySupport.h" diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp index 2eb310b893..54f71f9765 100644 --- a/src/runtime/CL/functions/CLConcatenateLayer.cpp +++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp @@ -23,19 +23,19 @@ */ #include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h" -#include "arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h" -#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h" -#include "arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLDepthConcatenateLayerKernel.h" +#include "src/core/CL/kernels/CLHeightConcatenateLayerKernel.h" +#include "src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h" +#include "src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h" +#include "src/core/CL/kernels/CLWidthConcatenateLayerKernel.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLBatchConcatenateLayerKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "support/MemorySupport.h" diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp index b291ae5b88..8ecc114343 100644 --- a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp +++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp @@ -22,6 +22,8 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h" +#include "src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "support/MemorySupport.h" diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp index bc962d0052..1ad32d309c 100644 --- a/src/runtime/CL/functions/CLConvolution.cpp +++ b/src/runtime/CL/functions/CLConvolution.cpp @@ -24,7 +24,6 @@ #include "arm_compute/runtime/CL/functions/CLConvolution.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" @@ -32,6 +31,8 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/ITensorAllocator.h" +#include "src/core/CL/kernels/CLConvolutionKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "support/MemorySupport.h" #include @@ -49,15 +50,20 @@ void CLConvolution3x3::configure(const CLCompileContext &compile_context, ICLTen auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, output, conv, scale, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } template CLConvolutionSquare::CLConvolutionSquare(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler() + : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(support::cpp14::make_unique>()), + _kernel_vert(support::cpp14::make_unique>()), _kernel(support::cpp14::make_unique>()), + _border_handler(support::cpp14::make_unique()) { } +template +CLConvolutionSquare::~CLConvolutionSquare() = default; + template void CLConvolutionSquare::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) @@ -88,35 +94,35 @@ void CLConvolutionSquare::configure(const CLCompileContext &compile scale = calculate_matrix_scale(conv, matrix_size); } - _kernel_hor.configure(compile_context, input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED); - _kernel_vert.configure(compile_context, &_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED, type_pair.second); - _border_handler.configure(compile_context, input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value)); + _kernel_hor->configure(compile_context, input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED); + _kernel_vert->configure(compile_context, &_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED, type_pair.second); + _border_handler->configure(compile_context, input, _kernel_hor->border_size(), border_mode, PixelValue(constant_border_value)); // Allocate intermediate buffer _tmp.allocator()->allocate(); } else { - _kernel.configure(compile_context, input, output, conv, scale, border_mode == BorderMode::UNDEFINED); - _border_handler.configure(compile_context, input, _kernel.border_size(), border_mode, PixelValue(constant_border_value)); + 
_kernel->configure(compile_context, input, output, conv, scale, border_mode == BorderMode::UNDEFINED); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } } template void CLConvolutionSquare::run() { - CLScheduler::get().enqueue(_border_handler); + CLScheduler::get().enqueue(*_border_handler); if(_is_separable) { MemoryGroupResourceScope scope_mg(_memory_group); - CLScheduler::get().enqueue(_kernel_hor, false); - CLScheduler::get().enqueue(_kernel_vert); + CLScheduler::get().enqueue(*_kernel_hor, false); + CLScheduler::get().enqueue(*_kernel_vert); } else { - CLScheduler::get().enqueue(_kernel); + CLScheduler::get().enqueue(*_kernel); } } @@ -135,5 +141,5 @@ void CLConvolutionRectangle::configure(const CLCompileContext &compile_context, auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp index 85355f0f17..e214bdf0f2 100644 --- a/src/runtime/CL/functions/CLConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp @@ -29,7 +29,6 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" - #include "support/MemorySupport.h" #include @@ -45,6 +44,8 @@ CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr memory_ma { } +CLConvolutionLayer::~CLConvolutionLayer() = default; + void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) { diff --git a/src/runtime/CL/functions/CLCopy.cpp b/src/runtime/CL/functions/CLCopy.cpp index acdc52d4f7..f7b016a779 100644 --- a/src/runtime/CL/functions/CLCopy.cpp +++ b/src/runtime/CL/functions/CLCopy.cpp @@ -24,11 +24,11 @@ #include "arm_compute/runtime/CL/functions/CLCopy.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLCopyKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "src/core/CL/kernels/CLCopyKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp index 4cf9f13a67..4aaa674c5c 100644 --- a/src/runtime/CL/functions/CLCropResize.cpp +++ b/src/runtime/CL/functions/CLCropResize.cpp @@ -25,6 +25,10 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLCopyKernel.h" +#include "src/core/CL/kernels/CLCropKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLMemsetKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -61,6 +65,8 @@ CLCropResize::CLCropResize() { } +CLCropResize::~CLCropResize() = default; + Status 
CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITensorInfo *box_ind, const ITensorInfo *output, Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value) { diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp index e6717b6d01..6fe231ea6c 100644 --- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp @@ -28,7 +28,6 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" - #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp index eb1fb7fbdf..0cf2ea623f 100644 --- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp +++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp @@ -27,16 +27,21 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTensor.h" +#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" +#include "src/core/CL/kernels/CLMemsetKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { CLDeconvolutionLayerUpsample::CLDeconvolutionLayerUpsample() // NOLINT - : _upsample(), - _memset(), + : _upsample(support::cpp14::make_unique()), + _memset(support::cpp14::make_unique()), _output(nullptr) { } +CLDeconvolutionLayerUpsample::~CLDeconvolutionLayerUpsample() = default; + Status CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info) { return CLDeconvolutionLayerUpsampleKernel::validate(input, output, info); @@ -52,13 +57,13 @@ void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_con ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); _output = output; - _memset.configure(compile_context, _output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info())); - _upsample.configure(compile_context, input, _output, info); + _memset->configure(compile_context, _output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info())); + _upsample->configure(compile_context, input, _output, info); } void CLDeconvolutionLayerUpsample::run() { - CLScheduler::get().enqueue(_memset, false); - CLScheduler::get().enqueue(_upsample, true); + CLScheduler::get().enqueue(*_memset, false); + CLScheduler::get().enqueue(*_upsample, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp index 141eb3fefc..e58c0e5f4c 100644 --- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp +++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h" -#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp index 8571056104..8dbd974ceb 100644 --- a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp +++ b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h" -#include "arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h" 
+#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp index bb0db2e7a7..2440384e3b 100644 --- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp @@ -24,13 +24,19 @@ #include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h" #include "support/MemorySupport.h" namespace arm_compute @@ -119,7 +125,7 @@ Status validate_arguments_3x3(const ITensorInfo *input, const ITensorInfo *weigh CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::CLDepthwiseConvolutionLayerGeneric(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), - _dwc_native_kernel(), + _dwc_native_kernel(support::cpp14::make_unique()), _permute_input_to_nhwc(), _permute_weights_to_nhwc(), _permute_output_to_nchw(), @@ -137,6 +143,8 @@ CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::CLDepthwiseConv { } +CLDepthwiseConvolutionLayer::~CLDepthwiseConvolutionLayer() = default; + void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) { @@ -206,9 +214,9 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure( dwc_weights_info.n0 = (depth_multiplier == 1) ? 
8 : 1; DWCKernelInfo dwc_info; dwc_info.activation_info = act_info; - _dwc_native_kernel.configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, - dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, - output_multipliers_to_use, output_shifts_to_use); + _dwc_native_kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, + dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, + output_multipliers_to_use, output_shifts_to_use); if(_needs_permute) { @@ -302,7 +310,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::run() { _permute_input_to_nhwc.run(); } - CLScheduler::get().enqueue(_dwc_native_kernel); + CLScheduler::get().enqueue(*_dwc_native_kernel); if(_needs_permute) { _permute_output_to_nchw.run(); @@ -343,11 +351,11 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::prepare() CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::CLDepthwiseConvolutionLayerInternal3x3(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _kernel(nullptr), - _border_handler(), + _border_handler(support::cpp14::make_unique()), _permute_input_to_nchw(), _permute_weights_to_nchw(), _permute_output_to_nhwc(), - _reshape_weights(), + _reshape_weights(support::cpp14::make_unique()), _permuted_input(), _permuted_weights(), _permuted_output(), @@ -378,14 +386,14 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayerInternal3x3::validate(input->info(), - weights->info(), - biases != nullptr ? biases->info() : nullptr, - output->info(), - conv_info, - depth_multiplier, - act_info, - gpu_target, - dilation)); + weights->info(), + biases != nullptr ? 
biases->info() : nullptr, + output->info(), + conv_info, + depth_multiplier, + act_info, + gpu_target, + dilation)); const bool is_nhwc = input->info()->data_layout() == DataLayout::NHWC; _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); @@ -434,7 +442,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config { if(_needs_weights_reshape) { - _reshape_weights.configure(compile_context, weights, &_permuted_weights, info); + _reshape_weights->configure(compile_context, weights, &_permuted_weights, info); weights_to_use = &_permuted_weights; } _kernel = arm_compute::support::cpp14::make_unique(); @@ -486,7 +494,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config { zero_value = PixelValue(static_cast(input->info()->quantization_info().uniform().offset)); } - _border_handler.configure(compile_context, input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value); + _border_handler->configure(compile_context, input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value); } Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, @@ -505,7 +513,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::run() { _permute_input_to_nchw.run(); } - CLScheduler::get().enqueue(_border_handler); + CLScheduler::get().enqueue(*_border_handler); CLScheduler::get().enqueue(*_kernel); if(_needs_permute) @@ -547,7 +555,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::prepar ARM_COMPUTE_ERROR_ON(_needs_permute); ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); _permuted_weights.allocator()->allocate(); - CLScheduler::get().enqueue(_reshape_weights); + CLScheduler::get().enqueue(*_reshape_weights); _original_weights->mark_as_unused(); } _is_prepared = true; @@ -567,7 +575,7 @@ void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *w void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, + unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) { const GPUTarget gpu_target = CLScheduler::get().target(); diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp index 66ac58ef95..6d63463906 100644 --- a/src/runtime/CL/functions/CLDequantizationLayer.cpp +++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h" -#include "arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h" +#include "src/core/CL/kernels/CLDequantizationLayerKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLDerivative.cpp b/src/runtime/CL/functions/CLDerivative.cpp index 7138281f87..a2b883ad28 100644 --- a/src/runtime/CL/functions/CLDerivative.cpp +++ b/src/runtime/CL/functions/CLDerivative.cpp @@ -23,8 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLDerivative.h" -#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLDerivativeKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "support/MemorySupport.h" 
 
 #include
@@ -41,5 +42,5 @@ void CLDerivative::configure(const CLCompileContext &compile_context, ICLTensor
     auto k = arm_compute::support::cpp14::make_unique<CLDerivativeKernel>();
     k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
+    _border_handler->configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
 }
diff --git a/src/runtime/CL/functions/CLDilate.cpp b/src/runtime/CL/functions/CLDilate.cpp
index 27acf9f7cc..c3d5f8845f 100644
--- a/src/runtime/CL/functions/CLDilate.cpp
+++ b/src/runtime/CL/functions/CLDilate.cpp
@@ -23,8 +23,9 @@
  */
 #include "arm_compute/runtime/CL/functions/CLDilate.h"
 
-#include "arm_compute/core/CL/kernels/CLDilateKernel.h"
 #include "arm_compute/core/PixelValue.h"
+#include "src/core/CL/kernels/CLDilateKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "support/MemorySupport.h"
 
 #include
@@ -41,5 +42,5 @@ void CLDilate::configure(const CLCompileContext &compile_context, ICLTensor *inp
     auto k = arm_compute::support::cpp14::make_unique<CLDilateKernel>();
     k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
+    _border_handler->configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
 }
diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
index 07e7a18941..bff882c28b 100644
--- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -24,19 +24,24 @@
 #include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "support/MemorySupport.h"
 
 using namespace arm_compute;
 
 CLDirectConvolutionLayer::CLDirectConvolutionLayer()
-    : _direct_conv_kernel(), _input_border_handler(), _activationlayer_function(), _is_activationlayer_enabled(false)
+    : _direct_conv_kernel(support::cpp14::make_unique<CLDirectConvolutionLayerKernel>()), _input_border_handler(support::cpp14::make_unique<CLFillBorderKernel>()), _activationlayer_function(),
+      _is_activationlayer_enabled(false)
 {
 }
 
+CLDirectConvolutionLayer::~CLDirectConvolutionLayer() = default;
+
 void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info);
@@ -47,10 +52,10 @@ void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context
                                          const ActivationLayerInfo &act_info)
 {
     // Set GPU target
-    _direct_conv_kernel.set_target(CLScheduler::get().target());
+    _direct_conv_kernel->set_target(CLScheduler::get().target());
 
     // Configure direct convolution
-    _direct_conv_kernel.configure(compile_context, input, weights, biases, output, conv_info);
+    _direct_conv_kernel->configure(compile_context, input, weights, biases, output, conv_info);
 
     // Configure border handler
     PixelValue &&zero_value(0.f);
@@ -58,10 +63,10 @@ void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context
     {
         zero_value = PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
     }
-    _input_border_handler.configure(compile_context, input, _direct_conv_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+    _input_border_handler->configure(compile_context, input, _direct_conv_kernel->border_size(), BorderMode::CONSTANT, zero_value);
 
     // Tune kernels
-    CLScheduler::get().tune_kernel_static(_direct_conv_kernel);
+    CLScheduler::get().tune_kernel_static(*_direct_conv_kernel);
 
     _is_activationlayer_enabled = act_info.enabled();
 
@@ -86,10 +91,10 @@ Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITenso
 void CLDirectConvolutionLayer::run()
 {
     // Run border handler
-    CLScheduler::get().enqueue(_input_border_handler, false);
+    CLScheduler::get().enqueue(*_input_border_handler, false);
 
     // Run direct convolution
-    CLScheduler::get().enqueue(_direct_conv_kernel);
+    CLScheduler::get().enqueue(*_direct_conv_kernel);
 
     //Run Activation Layer
     if(_is_activationlayer_enabled)
diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
index 0ffafa0221..0e3109439e 100644
--- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
@@ -23,11 +23,17 @@
  */
 #include "arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h"
 
+#include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
+#include "src/core/CL/kernels/CLMemsetKernel.h"
+#include "src/core/CL/kernels/CLWeightsReshapeKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 
 #include
diff --git a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
index de94255b48..35ed97d381 100644
--- a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
+++ b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
 
-#include "arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h"
+#include "src/core/CL/kernels/CLElementWiseUnaryLayerKernel.h"
 #include "support/MemorySupport.h"
 
 #include
diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp
index 7b4d3c629d..736cf973a1 100644
--- a/src/runtime/CL/functions/CLElementwiseOperations.cpp
+++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp
@@ -24,8 +24,8 @@
 #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLElementwiseOperationKernel.h"
 #include "support/MemorySupport.h"
 
 #include
diff --git a/src/runtime/CL/functions/CLEqualizeHistogram.cpp b/src/runtime/CL/functions/CLEqualizeHistogram.cpp
index a1158a71a5..cc927a055b 100644
--- a/src/runtime/CL/functions/CLEqualizeHistogram.cpp
+++ b/src/runtime/CL/functions/CLEqualizeHistogram.cpp
@@ -28,6 +28,9 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLHistogramKernel.h"
+#include "src/core/CL/kernels/CLTableLookupKernel.h"
+#include "support/MemorySupport.h"
 
 #include
 #include
@@ -83,10 +86,17 @@ void calculate_cum_dist_and_lut(CLDistribution1D &dist, CLDistribution1D &cum_di
 } // namespace
 
 CLEqualizeHistogram::CLEqualizeHistogram()
-    : _histogram_kernel(), _border_histogram_kernel(), _map_histogram_kernel(), _hist(nr_bins, 0, max_range), _cum_dist(nr_bins, 0, max_range), _cd_lut(nr_bins, DataType::U8)
+    : _histogram_kernel(support::cpp14::make_unique<CLHistogramKernel>()),
+      _border_histogram_kernel(support::cpp14::make_unique<CLHistogramBorderKernel>()),
+      _map_histogram_kernel(support::cpp14::make_unique<CLTableLookupKernel>()),
+      _hist(nr_bins, 0, max_range),
+      _cum_dist(nr_bins, 0, max_range),
+      _cd_lut(nr_bins, DataType::U8)
 {
 }
 
+CLEqualizeHistogram::~CLEqualizeHistogram() = default;
+
 void CLEqualizeHistogram::configure(const ICLImage *input, ICLImage *output)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output);
@@ -94,22 +104,22 @@ void CLEqualizeHistogram::configure(const ICLImage *input, ICLImage *output)
 
 void CLEqualizeHistogram::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLImage *output)
 {
-    _histogram_kernel.configure(compile_context, input, &_hist);
-    _border_histogram_kernel.configure(compile_context, input, &_hist);
-    _map_histogram_kernel.configure(compile_context, input, &_cd_lut, output);
+    _histogram_kernel->configure(compile_context, input, &_hist);
+    _border_histogram_kernel->configure(compile_context, input, &_hist);
+    _map_histogram_kernel->configure(compile_context, input, &_cd_lut, output);
 }
 
 void CLEqualizeHistogram::run()
 {
     // Calculate histogram of input.
-    CLScheduler::get().enqueue(_histogram_kernel, false);
+    CLScheduler::get().enqueue(*_histogram_kernel, false);
 
     // Calculate remaining pixels when image is not multiple of the elements of histogram kernel
-    CLScheduler::get().enqueue(_border_histogram_kernel, false);
+    CLScheduler::get().enqueue(*_border_histogram_kernel, false);
 
     // Calculate cumulative distribution of histogram and create LUT.
     calculate_cum_dist_and_lut(_hist, _cum_dist, _cd_lut);
 
     // Map input to output using created LUT.
- CLScheduler::get().enqueue(_map_histogram_kernel); + CLScheduler::get().enqueue(*_map_histogram_kernel); } diff --git a/src/runtime/CL/functions/CLErode.cpp b/src/runtime/CL/functions/CLErode.cpp index 5236f620f1..6880c4845a 100644 --- a/src/runtime/CL/functions/CLErode.cpp +++ b/src/runtime/CL/functions/CLErode.cpp @@ -23,8 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLErode.h" -#include "arm_compute/core/CL/kernels/CLErodeKernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLErodeKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "support/MemorySupport.h" #include @@ -41,5 +42,5 @@ void CLErode::configure(const CLCompileContext &compile_context, ICLTensor *inpu auto k = arm_compute::support::cpp14::make_unique<CLErodeKernel>(); k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp index 1269cba90d..a0078689ff 100644 --- a/src/runtime/CL/functions/CLFFT1D.cpp +++ b/src/runtime/CL/functions/CLFFT1D.cpp @@ -26,15 +26,28 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" +#include "src/core/CL/kernels/CLFFTRadixStageKernel.h" +#include "src/core/CL/kernels/CLFFTScaleKernel.h" #include "src/core/utils/helpers/fft.h" +#include "support/MemorySupport.h" namespace arm_compute { CLFFT1D::CLFFT1D(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _run_scale(false) + : _memory_group(std::move(memory_manager)), + _digit_reverse_kernel(support::cpp14::make_unique<CLFFTDigitReverseKernel>()), + _fft_kernels(), + _scale_kernel(support::cpp14::make_unique<CLFFTScaleKernel>()), + _digit_reversed_input(), + _digit_reverse_indices(), + _num_ffts(0), + _run_scale(false) { } +CLFFT1D::~CLFFT1D() = default; + void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config) { configure(CLKernelLibrary::get().get_compile_context(), input, output, config); @@ -62,12 +75,12 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32); _digit_reverse_indices.allocator()->init(digit_reverse_indices_info); _memory_group.manage(&_digit_reversed_input); - _digit_reverse_kernel.configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config); + _digit_reverse_kernel->configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config); // Create and configure FFT kernels unsigned int Nx = 1; _num_ffts = decomposed_vector.size(); - _fft_kernels.resize(_num_ffts); + _fft_kernels.reserve(_num_ffts); for(unsigned int i = 0; i < _num_ffts; ++i) { const unsigned int radix_for_stage = decomposed_vector.at(i); @@ -77,7 +90,8 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor fft_kernel_info.radix = radix_for_stage; fft_kernel_info.Nx = Nx; fft_kernel_info.is_first_stage = (i == 0); -
_fft_kernels[i].configure(compile_context, &_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info); + _fft_kernels.emplace_back(support::cpp14::make_unique<CLFFTRadixStageKernel>()); + _fft_kernels.back()->configure(compile_context, &_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info); Nx *= radix_for_stage; } @@ -88,7 +102,7 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor FFTScaleKernelInfo scale_config; scale_config.scale = static_cast<float>(N); scale_config.conjugate = config.direction == FFTDirection::Inverse; - is_c2r ? _scale_kernel.configure(compile_context, &_digit_reversed_input, output, scale_config) : _scale_kernel.configure(output, nullptr, scale_config); + is_c2r ? _scale_kernel->configure(compile_context, &_digit_reversed_input, output, scale_config) : _scale_kernel->configure(output, nullptr, scale_config); } // Allocate tensors @@ -132,18 +146,18 @@ void CLFFT1D::run() MemoryGroupResourceScope scope_mg(_memory_group); // Run digit reverse - CLScheduler::get().enqueue(_digit_reverse_kernel, false); + CLScheduler::get().enqueue(*_digit_reverse_kernel, false); // Run radix kernels for(unsigned int i = 0; i < _num_ffts; ++i) { - CLScheduler::get().enqueue(_fft_kernels[i], i == (_num_ffts - 1) && !_run_scale); + CLScheduler::get().enqueue(*_fft_kernels[i], i == (_num_ffts - 1) && !_run_scale); } // Run output scaling if(_run_scale) { - CLScheduler::get().enqueue(_scale_kernel, true); + CLScheduler::get().enqueue(*_scale_kernel, true); } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLFFT2D.cpp b/src/runtime/CL/functions/CLFFT2D.cpp index 7ab852fa98..1d444bb15d 100644 --- a/src/runtime/CL/functions/CLFFT2D.cpp +++ b/src/runtime/CL/functions/CLFFT2D.cpp @@ -26,6 +26,9 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" +#include "src/core/CL/kernels/CLFFTRadixStageKernel.h" +#include "src/core/CL/kernels/CLFFTScaleKernel.h" namespace arm_compute { @@ -34,6 +37,8 @@ CLFFT2D::CLFFT2D(std::shared_ptr<IMemoryManager> memory_manager) { } +CLFFT2D::~CLFFT2D() = default; + void CLFFT2D::configure(const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config) { configure(CLKernelLibrary::get().get_compile_context(), input, output, config); diff --git a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp index 4d0eab81ee..5472e8469f 100644 --- a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp @@ -29,6 +29,13 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CPP/CPPScheduler.h" +#include "src/core/CL/kernels/CLCopyKernel.h" +#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" +#include "src/core/CL/kernels/CLFFTRadixStageKernel.h" +#include "src/core/CL/kernels/CLFFTScaleKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLPadLayerKernel.h" +#include "src/core/CL/kernels/CLReductionOperationKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/utils/helpers/fft.h" diff --git a/src/runtime/CL/functions/CLFastCorners.cpp b/src/runtime/CL/functions/CLFastCorners.cpp index 97f853fdea..110d2c3639 100644 --- a/src/runtime/CL/functions/CLFastCorners.cpp +++
b/src/runtime/CL/functions/CLFastCorners.cpp @@ -24,12 +24,14 @@ #include "arm_compute/runtime/CL/functions/CLFastCorners.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/ITensorAllocator.h" +#include "src/core/CL/kernels/CLFastCornersKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "support/MemorySupport.h" #include #include @@ -38,9 +40,9 @@ using namespace arm_compute; CLFastCorners::CLFastCorners(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), - _fast_corners_kernel(), + _fast_corners_kernel(support::cpp14::make_unique()), _suppr_func(), - _copy_array_kernel(), + _copy_array_kernel(support::cpp14::make_unique()), _output(), _suppr(), _win(), @@ -52,6 +54,8 @@ CLFastCorners::CLFastCorners(std::shared_ptr memory_manager) { } +CLFastCorners::~CLFastCorners() = default; + void CLFastCorners::configure(const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners, unsigned int *num_corners, BorderMode border_mode, uint8_t constant_border_value) { @@ -78,11 +82,11 @@ void CLFastCorners::configure(const CLCompileContext &compile_context, const ICL const bool update_number = (nullptr != _num_corners); _memory_group.manage(&_output); - _fast_corners_kernel.configure(compile_context, input, &_output, threshold, nonmax_suppression, border_mode); + _fast_corners_kernel->configure(compile_context, input, &_output, threshold, nonmax_suppression, border_mode); if(!_non_max) { - _copy_array_kernel.configure(compile_context, &_output, update_number, _corners, &_num_buffer); + _copy_array_kernel->configure(compile_context, &_output, update_number, _corners, &_num_buffer); } else { @@ -90,7 +94,7 @@ void CLFastCorners::configure(const CLCompileContext &compile_context, const ICL _memory_group.manage(&_suppr); _suppr_func.configure(compile_context, &_output, &_suppr, border_mode); - _copy_array_kernel.configure(compile_context, &_suppr, update_number, _corners, &_num_buffer); + _copy_array_kernel->configure(compile_context, &_suppr, update_number, _corners, &_num_buffer); _suppr.allocator()->allocate(); } @@ -113,14 +117,14 @@ void CLFastCorners::run() q.enqueueUnmapMemObject(_output.cl_buffer(), out_buffer); } - CLScheduler::get().enqueue(_fast_corners_kernel, false); + CLScheduler::get().enqueue(*_fast_corners_kernel, false); if(_non_max) { _suppr_func.run(); } - CLScheduler::get().enqueue(_copy_array_kernel, false); + CLScheduler::get().enqueue(*_copy_array_kernel, false); unsigned int get_num_corners = 0; q.enqueueReadBuffer(_num_buffer, CL_TRUE, 0, sizeof(unsigned int), &get_num_corners); diff --git a/src/runtime/CL/functions/CLFill.cpp b/src/runtime/CL/functions/CLFill.cpp index a89383ec31..855ed8380a 100644 --- a/src/runtime/CL/functions/CLFill.cpp +++ b/src/runtime/CL/functions/CLFill.cpp @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLFill.h" -#include "arm_compute/core/CL/kernels/CLMemsetKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLMemsetKernel.h" #include "support/MemorySupport.h" diff --git a/src/runtime/CL/functions/CLFillBorder.cpp b/src/runtime/CL/functions/CLFillBorder.cpp index c647bb6a02..27d132b842 100644 --- a/src/runtime/CL/functions/CLFillBorder.cpp +++ b/src/runtime/CL/functions/CLFillBorder.cpp @@ -23,7 
+23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLFillBorder.h" -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp index a826541017..0646a0d3a0 100644 --- a/src/runtime/CL/functions/CLFlattenLayer.cpp +++ b/src/runtime/CL/functions/CLFlattenLayer.cpp @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLFlattenLayer.h" -#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFlattenLayerKernel.h" #include "support/MemorySupport.h" using namespace arm_compute; diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp index 7ed92ac3df..770e6a3781 100644 --- a/src/runtime/CL/functions/CLFloor.cpp +++ b/src/runtime/CL/functions/CLFloor.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLFloor.h" -#include "arm_compute/core/CL/kernels/CLFloorKernel.h" +#include "src/core/CL/kernels/CLFloorKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp index 75e87c382b..1796443ca5 100644 --- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp @@ -28,6 +28,19 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "src/core/CL/kernels/CLTransposeKernel.h" #include "support/Cast.h" #include "support/MemorySupport.h" diff --git a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp index 825267c0fc..f018e5a8ae 100644 --- a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp +++ b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp @@ -28,14 +28,18 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { CLFuseBatchNormalization::CLFuseBatchNormalization() - : _fuse_bn_kernel() + : _fuse_bn_kernel(support::cpp14::make_unique()) { } +CLFuseBatchNormalization::~CLFuseBatchNormalization() = default; + void CLFuseBatchNormalization::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias, const ICLTensor *input_bias, const 
ICLTensor *bn_beta, const ICLTensor *bn_gamma, @@ -49,7 +53,7 @@ void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma, float epsilon, FuseBatchNormalizationType fbn_type) { - _fuse_bn_kernel.configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + _fuse_bn_kernel->configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, @@ -62,6 +66,6 @@ Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, cons void CLFuseBatchNormalization::run() { - CLScheduler::get().enqueue(_fuse_bn_kernel, true); + CLScheduler::get().enqueue(*_fuse_bn_kernel, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index 80c5496ede..0151485849 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLGEMM.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/GPUTarget.h" @@ -38,6 +39,11 @@ #include "src/core/CL/ICLGEMMKernelConfiguration.h" #include "src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h" #include "src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/utils/helpers/float_ops.h" #include "src/runtime/CL/gemm/CLGEMMKernelSelection.h" @@ -51,16 +57,58 @@ using namespace arm_compute::misc::shape_calculator; using namespace arm_compute::cl_gemm; using namespace arm_compute::utils::cast; +namespace weights_transformations +{ +CLGEMMReshapeRHSMatrixKernelManaged::CLGEMMReshapeRHSMatrixKernelManaged() + : _kernel(support::cpp14::make_unique<CLGEMMReshapeRHSMatrixKernel>()) +{ +} + +CLGEMMReshapeRHSMatrixKernelManaged::~CLGEMMReshapeRHSMatrixKernelManaged() = default; + +void CLGEMMReshapeRHSMatrixKernelManaged::run() +{ + _output.allocator()->allocate(); + CLScheduler::get().enqueue(*_kernel, false); + _reshape_run = true; +} + +void CLGEMMReshapeRHSMatrixKernelManaged::release() +{ + _output.allocator()->free(); +} + +ICLTensor *CLGEMMReshapeRHSMatrixKernelManaged::get_weights() +{ + return &_output; +} + +uint32_t CLGEMMReshapeRHSMatrixKernelManaged::uid() +{ + return _uid; +} + +void CLGEMMReshapeRHSMatrixKernelManaged::configure(const ICLTensor *input, GEMMRHSMatrixInfo info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, info); +} + +void CLGEMMReshapeRHSMatrixKernelManaged::configure(const CLCompileContext &compile_context, const ICLTensor *input, GEMMRHSMatrixInfo info) +{ + _kernel->configure(compile_context, input, &_output, info); +} +} // namespace weights_transformations + CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) : _memory_group(std::move(memory_manager)),
_weights_manager(weights_manager), - _mm_kernel(), - _reshape_lhs_kernel(), - _reshape_rhs_kernel(), - _reshape_rhs_kernel_managed(), - _mm_reshaped_kernel(), - _mm_reshaped_only_rhs_kernel(), - _mm_reshaped_only_rhs_fallback_kernel(), + _mm_kernel(support::cpp14::make_unique<CLGEMMMatrixMultiplyKernel>()), + _reshape_lhs_kernel(support::cpp14::make_unique<CLGEMMReshapeLHSMatrixKernel>()), + _reshape_rhs_kernel(support::cpp14::make_unique<CLGEMMReshapeRHSMatrixKernel>()), + _reshape_rhs_kernel_managed(support::cpp14::make_unique<weights_transformations::CLGEMMReshapeRHSMatrixKernelManaged>()), + _mm_reshaped_kernel(support::cpp14::make_unique<CLGEMMMatrixMultiplyReshapedKernel>()), + _mm_reshaped_only_rhs_kernel(support::cpp14::make_unique<CLGEMMMatrixMultiplyReshapedOnlyRHSKernel>()), + _mm_reshaped_only_rhs_fallback_kernel(support::cpp14::make_unique<CLGEMMMatrixMultiplyReshapedOnlyRHSKernel>()), _tmp_a(), _tmp_b(), _original_b(nullptr), @@ -73,6 +121,8 @@ CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager * { } +CLGEMM::~CLGEMM() = default; + CLGEMMKernelType CLGEMM::select_gemm_kernel(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type, bool reshape_b_only_on_first_run) { std::unique_ptr<ICLGEMMKernelSelection> gemm_kernel = CLGEMMKernelSelectionFactory::create(CLScheduler::get().target()); @@ -98,15 +148,15 @@ void CLGEMM::configure_native_v1(const CLCompileContext &compile_context, const const GPUTarget gpu_target = CLScheduler::get().target(); // Set the target for the kernels - _mm_kernel.set_target(gpu_target); + _mm_kernel->set_target(gpu_target); GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d(), gemm_info.broadcast_bias()); // Configure and tune matrix multiply kernel - _mm_kernel.configure(compile_context, a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info()); + _mm_kernel->configure(compile_context, a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info()); // Tune kernel statically - CLScheduler::get().tune_kernel_static(_mm_kernel); + CLScheduler::get().tune_kernel_static(*_mm_kernel); } void CLGEMM::configure_reshaped_v1(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, @@ -122,8 +172,8 @@ void CLGEMM::configure_reshaped_v1(const CLCompileContext &compile_context, cons int mult_interleave4x4_height = 1; // Set the target for the kernels - _reshape_lhs_kernel.set_target(gpu_target); - _mm_kernel.set_target(gpu_target); + _reshape_lhs_kernel->set_target(gpu_target); + _mm_kernel->set_target(gpu_target); if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST) { @@ -158,24 +208,24 @@ void CLGEMM::configure_reshaped_v1(const CLCompileContext &compile_context, cons } // Configure interleave kernel - _reshape_lhs_kernel.configure(compile_context, a, &_tmp_a, lhs_info, reinterpret_input_as_3d); + _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, reinterpret_input_as_3d); // Configure transpose kernel ICLTensor *reshaped_rhs = &_tmp_b; if(_weights_manager && _weights_manager->are_weights_managed(b)) { - _reshape_rhs_kernel_managed.configure(compile_context, b, rhs_info); - reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed)); + _reshape_rhs_kernel_managed->configure(compile_context, b, rhs_info); + reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, _reshape_rhs_kernel_managed.get())); } else { - _reshape_rhs_kernel.configure(compile_context, b, &_tmp_b, rhs_info); + _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info); } // Configure and tune
matrix multiply kernel - _mm_kernel.configure(compile_context, &_tmp_a, reshaped_rhs, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info()); + _mm_kernel->configure(compile_context, &_tmp_a, reshaped_rhs, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info()); - CLScheduler::get().tune_kernel_static(_mm_kernel); + CLScheduler::get().tune_kernel_static(*_mm_kernel); // Allocate intermediate tensors _tmp_a.allocator()->allocate(); @@ -209,8 +259,8 @@ void CLGEMM::configure_reshaped_v2(const CLCompileContext &compile_context, cons kernel_info.activation_info = gemm_info.activation_info(); // Set the target for the kernels - _reshape_lhs_kernel.set_target(gpu_target); - _mm_kernel.set_target(gpu_target); + _reshape_lhs_kernel->set_target(gpu_target); + _mm_kernel->set_target(gpu_target); const bool use_mm_b = (!_weights_manager || !_weights_manager->are_weights_managed(b)); @@ -234,21 +284,21 @@ void CLGEMM::configure_reshaped_v2(const CLCompileContext &compile_context, cons // Configure lhs_info and rhs_info std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type); - _reshape_lhs_kernel.configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d()); + _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d()); ICLTensor *reshaped_rhs = &_tmp_b; if(_weights_manager && _weights_manager->are_weights_managed(b)) { - _reshape_rhs_kernel_managed.configure(compile_context, b, rhs_info); - reshaped_rhs = utils::cast::polymorphic_downcast(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed)); + _reshape_rhs_kernel_managed->configure(compile_context, b, rhs_info); + reshaped_rhs = utils::cast::polymorphic_downcast(_weights_manager->acquire(b, _reshape_rhs_kernel_managed.get())); } else { - _reshape_rhs_kernel.configure(compile_context, b, &_tmp_b, rhs_info); + _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info); } // Configure and tune matrix multiply kernel - _mm_reshaped_kernel.configure(compile_context, &_tmp_a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + _mm_reshaped_kernel->configure(compile_context, &_tmp_a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); // Allocate intermediate tensors _tmp_a.allocator()->allocate(); @@ -282,7 +332,7 @@ void CLGEMM::configure_reshaped_only_rhs(const CLCompileContext &compile_context kernel_info.activation_info = gemm_info.activation_info(); // Set the target for the kernels - _mm_kernel.set_target(gpu_target); + _mm_kernel->set_target(gpu_target); const bool use_mm_b = (!_weights_manager || !_weights_manager->are_weights_managed(b)); @@ -305,12 +355,12 @@ void CLGEMM::configure_reshaped_only_rhs(const CLCompileContext &compile_context ICLTensor *reshaped_rhs = &_tmp_b; if(_weights_manager && _weights_manager->are_weights_managed(b)) { - _reshape_rhs_kernel_managed.configure(compile_context, b, rhs_info); - reshaped_rhs = utils::cast::polymorphic_downcast(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed)); + _reshape_rhs_kernel_managed->configure(compile_context, b, rhs_info); + reshaped_rhs = utils::cast::polymorphic_downcast(_weights_manager->acquire(b, _reshape_rhs_kernel_managed.get())); } else { - _reshape_rhs_kernel.configure(compile_context, b, &_tmp_b, rhs_info); + _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info); } // Configure two variants of 
CLGEMMMatrixMultiplyReshapedOnlyRHSKernel (has_pad_y = false/true) @@ -319,11 +369,11 @@ void CLGEMM::configure_reshaped_only_rhs(const CLCompileContext &compile_context // Configure matrix multiply kernel with no y padding support kernel_info.has_pad_y = false; - _mm_reshaped_only_rhs_kernel.configure(compile_context, a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + _mm_reshaped_only_rhs_kernel->configure(compile_context, a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); // Configure matrix multiply kernel with y padding support kernel_info.has_pad_y = true; - _mm_reshaped_only_rhs_fallback_kernel.configure(compile_context, a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + _mm_reshaped_only_rhs_fallback_kernel->configure(compile_context, a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); if(!_reshape_b_only_on_first_run && use_mm_b) { @@ -626,49 +676,49 @@ void CLGEMM::run() { case CLGEMMKernelType::NATIVE_V1: { - CLScheduler::get().enqueue(_mm_kernel, true); + CLScheduler::get().enqueue(*_mm_kernel, true); break; } case CLGEMMKernelType::RESHAPED_V1: { // Run interleave kernel - CLScheduler::get().enqueue(_reshape_lhs_kernel, false); + CLScheduler::get().enqueue(*_reshape_lhs_kernel, false); if(!_reshape_b_only_on_first_run) { // Run transpose kernel if(_weights_manager && _weights_manager->are_weights_managed(_original_b)) { - _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed); + _weights_manager->run(_original_b, _reshape_rhs_kernel_managed.get()); } else { - CLScheduler::get().enqueue(_reshape_rhs_kernel, false); + CLScheduler::get().enqueue(*_reshape_rhs_kernel, false); } } - CLScheduler::get().enqueue(_mm_kernel, true); + CLScheduler::get().enqueue(*_mm_kernel, true); break; } case CLGEMMKernelType::RESHAPED: { // Run interleave kernel - CLScheduler::get().enqueue(_reshape_lhs_kernel, false); + CLScheduler::get().enqueue(*_reshape_lhs_kernel, false); if(!_reshape_b_only_on_first_run) { // Run transpose kernel if(_weights_manager && _weights_manager->are_weights_managed(_original_b)) { - _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed); + _weights_manager->run(_original_b, _reshape_rhs_kernel_managed.get()); } else { - CLScheduler::get().enqueue(_reshape_rhs_kernel, false); + CLScheduler::get().enqueue(*_reshape_rhs_kernel, false); } } - CLScheduler::get().enqueue(_mm_reshaped_kernel, true); + CLScheduler::get().enqueue(*_mm_reshaped_kernel, true); break; } case CLGEMMKernelType::RESHAPED_ONLY_RHS: @@ -678,20 +728,20 @@ void CLGEMM::run() // Run transpose kernel if(_weights_manager && _weights_manager->are_weights_managed(_original_b)) { - _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed); + _weights_manager->run(_original_b, _reshape_rhs_kernel_managed.get()); } else { - CLScheduler::get().enqueue(_reshape_rhs_kernel, false); + CLScheduler::get().enqueue(*_reshape_rhs_kernel, false); } } if(_has_pad_y) { - CLScheduler::get().enqueue(_mm_reshaped_only_rhs_fallback_kernel, true); + CLScheduler::get().enqueue(*_mm_reshaped_only_rhs_fallback_kernel, true); } else { - CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, true); + CLScheduler::get().enqueue(*_mm_reshaped_only_rhs_kernel, true); } break; } @@ -720,13 +770,13 @@ void CLGEMM::prepare() { if(_weights_manager && _weights_manager->are_weights_managed(_original_b)) { - _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed); + _weights_manager->run(_original_b, 
_reshape_rhs_kernel_managed.get()); } else { // Run transpose kernel and mark original weights tensor as unused _tmp_b.allocator()->allocate(); - CLScheduler::get().enqueue(_reshape_rhs_kernel, false); + CLScheduler::get().enqueue(*_reshape_rhs_kernel, false); _original_b->mark_as_unused(); } } diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp index e871b39805..4d26df5e43 100644 --- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp @@ -30,8 +30,23 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLCol2ImKernel.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "src/core/CL/kernels/CLIm2ColKernel.h" +#include "src/core/CL/kernels/CLWeightsReshapeKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "support/Cast.h" +#include "support/MemorySupport.h" #include #include @@ -43,10 +58,12 @@ using namespace arm_compute::misc::shape_calculator; using namespace arm_compute::utils::cast; CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights() - : _weights_reshape_kernel() + : _weights_reshape_kernel(support::cpp14::make_unique()) { } +CLConvolutionLayerReshapeWeights::~CLConvolutionLayerReshapeWeights() = default; + void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups) { configure(CLKernelLibrary::get().get_compile_context(), weights, biases, output, num_groups); @@ -64,7 +81,7 @@ void CLConvolutionLayerReshapeWeights::configure(const CLCompileContext &compile const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type()); const ICLTensor *biases_to_use = (append_biases) ? 
biases : nullptr; - _weights_reshape_kernel.configure(compile_context, weights, biases_to_use, output, num_groups); + _weights_reshape_kernel->configure(compile_context, weights, biases_to_use, output, num_groups); output->info()->set_quantization_info(weights->info()->quantization_info()); } @@ -96,16 +113,18 @@ Status CLConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, co void CLConvolutionLayerReshapeWeights::run() { - CLScheduler::get().enqueue(_weights_reshape_kernel); + CLScheduler::get().enqueue(*_weights_reshape_kernel); } CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _reshape_weights(), _reshape_weights_managed(), _im2col_kernel(), _mm_gemm(memory_manager, weights_manager), - _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _skip_im2col(false), - _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _is_prepared(false) + : _memory_group(memory_manager), _weights_manager(weights_manager), _reshape_weights(), _reshape_weights_managed(), _im2col_kernel(support::cpp14::make_unique()), + _mm_gemm(memory_manager, weights_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(support::cpp14::make_unique()), _activationlayer_function(), _original_weights(nullptr), + _im2col_output(), _weights_reshaped(), _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _is_prepared(false) { } +CLGEMMConvolutionLayer::~CLGEMMConvolutionLayer() = default; + void CLGEMMConvolutionLayer::configure_mm(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, const ActivationLayerInfo &act_info) @@ -230,8 +249,8 @@ void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, _fuse_activation = true; // Set the GPU target for im2col and col2im - _im2col_kernel.set_target(CLScheduler::get().target()); - _col2im_kernel.set_target(CLScheduler::get().target()); + _im2col_kernel->set_target(CLScheduler::get().target()); + _col2im_kernel->set_target(CLScheduler::get().target()); const ICLTensor *gemm_input_to_use = input; ICLTensor *gemm_output_to_use = output; @@ -293,11 +312,11 @@ void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, _memory_group.manage(&_im2col_output); // Configure and tune im2col. 
im2col output shape is auto-initialized - _im2col_kernel.configure(compile_context, input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups); + _im2col_kernel->configure(compile_context, input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups); // Set quantization info _im2col_output.info()->set_quantization_info(input->info()->quantization_info()); - CLScheduler::get().tune_kernel_static(_im2col_kernel); + CLScheduler::get().tune_kernel_static(*_im2col_kernel); // Update GEMM input gemm_input_to_use = &_im2col_output; @@ -390,8 +409,8 @@ void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, if(!_skip_col2im) { // Configure and tune Col2Im - _col2im_kernel.configure(compile_context, gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups); - CLScheduler::get().tune_kernel_static(_col2im_kernel); + _col2im_kernel->configure(compile_context, gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups); + CLScheduler::get().tune_kernel_static(*_col2im_kernel.get()); } if(!_skip_col2im) @@ -611,7 +630,7 @@ void CLGEMMConvolutionLayer::run() // Run im2col if(!_skip_im2col) { - CLScheduler::get().enqueue(_im2col_kernel); + CLScheduler::get().enqueue(*_im2col_kernel); } // Runs CLGEMM or CLGEMMLowpMatrixMultiplyCore functions @@ -629,7 +648,7 @@ void CLGEMMConvolutionLayer::run() // Reshape output matrix if(!_skip_col2im) { - CLScheduler::get().enqueue(_col2im_kernel, false); + CLScheduler::get().enqueue(*_col2im_kernel.get(), false); } //Run Activation Layer if we cannot fuse in GEMM diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp index 5fc9c17bef..4d277f0982 100644 --- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp @@ -28,8 +28,23 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "src/core/CL/kernels/CLIm2ColKernel.h" +#include "src/core/CL/kernels/CLWeightsReshapeKernel.h" +#include "support/MemorySupport.h" -#include #include namespace arm_compute @@ -99,7 +114,7 @@ CLGEMMDeconvolutionLayer::CLGEMMDeconvolutionLayer(std::shared_ptr()), _slice_gemm(), _gemmlowp_final(), _reshaped_weights(), @@ -116,6 +131,8 @@ CLGEMMDeconvolutionLayer::CLGEMMDeconvolutionLayer(std::shared_ptrinfo(), weights->info(), deconv_info); + _deconv_reshape->configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), 
deconv_info); _gemm_output.allocator()->allocate(); if(_is_quantized) @@ -357,7 +374,7 @@ void CLGEMMDeconvolutionLayer::run() _mm_gemm.run(); } - CLScheduler::get().enqueue(_deconv_reshape, false); + CLScheduler::get().enqueue(*_deconv_reshape, false); if(_is_quantized) { diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp index 7a8de6c1f5..d3d80a39e3 100644 --- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp @@ -35,8 +35,16 @@ #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h" #include "src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/runtime/CL/gemm/CLGEMMKernelSelection.h" +#include "support/MemorySupport.h" namespace arm_compute { @@ -71,14 +79,14 @@ inline bool is_gemm_reshaped(unsigned int m, unsigned int n, unsigned int k, Dat CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), - _weights_to_qasymm8(), - _mm_native_kernel(), - _mm_reshaped_only_rhs_kernel(), - _mtx_b_reshape_kernel(), - _mtx_a_reduction_kernel(), - _mtx_b_reduction_kernel(), - _offset_contribution_kernel(), - _offset_contribution_output_stage_kernel(), + _weights_to_qasymm8(support::cpp14::make_unique()), + _mm_native_kernel(support::cpp14::make_unique()), + _mm_reshaped_only_rhs_kernel(support::cpp14::make_unique()), + _mtx_b_reshape_kernel(support::cpp14::make_unique()), + _mtx_a_reduction_kernel(support::cpp14::make_unique()), + _mtx_b_reduction_kernel(support::cpp14::make_unique()), + _offset_contribution_kernel(support::cpp14::make_unique()), + _offset_contribution_output_stage_kernel(support::cpp14::make_unique()), _qasymm8_weights(), _vector_sum_col(), _vector_sum_row(), @@ -100,6 +108,8 @@ CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptrset_target(gpu_target); + _mm_reshaped_only_rhs_kernel->set_target(gpu_target); GEMMRHSMatrixInfo rhs_info; GEMMLHSMatrixInfo lhs_info; @@ -150,7 +160,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con TensorInfo weights_info(*b->info()); weights_info.set_data_type(DataType::QASYMM8); _qasymm8_weights.allocator()->init(weights_info); - _weights_to_qasymm8.configure(compile_context, b, &_qasymm8_weights, ConvertPolicy::WRAP, 0); + _weights_to_qasymm8->configure(compile_context, b, &_qasymm8_weights, ConvertPolicy::WRAP, 0); } const ICLTensor *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b; @@ -168,7 +178,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); // Configure reshape RHS kernel - _mtx_b_reshape_kernel.configure(compile_context, _convert_to_qasymm8 ? 
&_qasymm8_weights : b, &_tmp_b, rhs_info); + _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info); } // Using default reduction info @@ -185,7 +195,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con } // Configure Matrix B reduction kernel - _mtx_b_reduction_kernel.configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info); + _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info); } // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 @@ -196,7 +206,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con _memory_group.manage(&_vector_sum_row); // Configure matrix A reduction kernel - _mtx_a_reduction_kernel.configure(compile_context, a, &_vector_sum_row, reduction_info); + _mtx_a_reduction_kernel->configure(compile_context, a, &_vector_sum_row, reduction_info); } GEMMKernelInfo gemm_kernel_info; @@ -226,8 +236,8 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con if(_is_gemm_reshaped && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { // Configure and tune matrix multiply kernel with fused output stage - _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); } else { @@ -237,7 +247,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con if(_is_gemm_reshaped) { - _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, gemm_kernel_info); + _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, gemm_kernel_info); } else { @@ -245,11 +255,11 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); // Configure matrix multiply kernel - _mm_native_kernel.configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); + _mm_native_kernel->configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); - _offset_contribution_output_stage_kernel.configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, - a->info()->dimension(0), - _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + _offset_contribution_output_stage_kernel->configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? 
nullptr : &_vector_sum_row, c, output, + a->info()->dimension(0), + _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); _mm_result_s32.allocator()->allocate(); } } @@ -270,7 +280,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con if(_is_gemm_reshaped) { // Configure and tune matrix multiply kernel - _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info); + _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info); } else { @@ -278,12 +288,12 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); // Configure matrix multiply kernel - _mm_native_kernel.configure(compile_context, _matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); + _mm_native_kernel->configure(compile_context, _matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d)); } // Configure offset contribution kernel - _offset_contribution_kernel.configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset, - _b_offset); + _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset, + _b_offset); } // Allocate tensors @@ -489,40 +499,40 @@ void CLGEMMLowpMatrixMultiplyCore::run() if(!_reshape_b_only_on_first_run) { // Run reshape matrix B - CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false); + CLScheduler::get().enqueue(*_mtx_b_reshape_kernel, false); } } // Run matrix B reduction kernel only if _a_offset is not equal to 0 if(_a_offset != 0 && !_reshape_b_only_on_first_run) { - CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false); + CLScheduler::get().enqueue(*_mtx_b_reduction_kernel, false); } // Run matrix A reduction kernel only if _b_offset is not equal to 0 if(_b_offset != 0) { - CLScheduler::get().enqueue(_mtx_a_reduction_kernel, false); + CLScheduler::get().enqueue(*_mtx_a_reduction_kernel, false); } // Run matrix multiply if(_is_gemm_reshaped) { - CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, false); + CLScheduler::get().enqueue(*_mm_reshaped_only_rhs_kernel, false); } else { - CLScheduler::get().enqueue(_mm_native_kernel, false); + CLScheduler::get().enqueue(*_mm_native_kernel, false); } if(_run_output_stage) { // Run offset contribution/output stage kernel - CLScheduler::get().enqueue(_offset_contribution_output_stage_kernel, true); + CLScheduler::get().enqueue(*_offset_contribution_output_stage_kernel, true); } if(_run_offset_contribution) { // Run offset contribution kernel - CLScheduler::get().enqueue(_offset_contribution_kernel, true); + CLScheduler::get().enqueue(*_offset_contribution_kernel, true); } } @@ -533,7 +543,7 @@ void CLGEMMLowpMatrixMultiplyCore::prepare() if(_convert_to_qasymm8) { _qasymm8_weights.allocator()->allocate(); - CLScheduler::get().enqueue(_weights_to_qasymm8, false); + CLScheduler::get().enqueue(*_weights_to_qasymm8, false); } if(_is_gemm_reshaped && _reshape_b_only_on_first_run) @@ -542,7 +552,7 @@ void CLGEMMLowpMatrixMultiplyCore::prepare() // Run 
reshape kernel and mark original weights tensor as unused _tmp_b.allocator()->allocate(); - CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false); + CLScheduler::get().enqueue(*_mtx_b_reshape_kernel, false); _original_b->mark_as_unused(); } @@ -550,7 +560,7 @@ void CLGEMMLowpMatrixMultiplyCore::prepare() if(_a_offset != 0 && _reshape_b_only_on_first_run) { _vector_sum_col.allocator()->allocate(); - CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false); + CLScheduler::get().enqueue(*_mtx_b_reduction_kernel, false); } CLScheduler::get().queue().finish(); diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp index 28f397fd8b..f9c5247d2d 100644 --- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp @@ -24,11 +24,14 @@ #include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h" +#include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h" #include "support/MemorySupport.h" +#include + namespace arm_compute { void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp index d9b6679ebf..de6296f6a3 100644 --- a/src/runtime/CL/functions/CLGather.cpp +++ b/src/runtime/CL/functions/CLGather.cpp @@ -24,7 +24,7 @@ #include "arm_compute/runtime/CL/functions/CLGather.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLGatherKernel.h" +#include "src/core/CL/kernels/CLGatherKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLGaussian3x3.cpp b/src/runtime/CL/functions/CLGaussian3x3.cpp index c62e200315..97db9ba06d 100644 --- a/src/runtime/CL/functions/CLGaussian3x3.cpp +++ b/src/runtime/CL/functions/CLGaussian3x3.cpp @@ -23,8 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLGaussian3x3.h" -#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGaussian3x3Kernel.h" #include "support/MemorySupport.h" #include @@ -41,5 +42,5 @@ void CLGaussian3x3::configure(const CLCompileContext &compile_context, ICLTensor auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLGaussian5x5.cpp b/src/runtime/CL/functions/CLGaussian5x5.cpp index 1fe2fddfb6..f7470d4ecf 100644 --- a/src/runtime/CL/functions/CLGaussian5x5.cpp +++ b/src/runtime/CL/functions/CLGaussian5x5.cpp @@ -24,22 +24,30 @@ #include 
"arm_compute/runtime/CL/functions/CLGaussian5x5.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/ITensorAllocator.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGaussian5x5Kernel.h" +#include "support/MemorySupport.h" #include using namespace arm_compute; CLGaussian5x5::CLGaussian5x5(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _kernel_hor(), _kernel_vert(), _border_handler(), _tmp() + : _memory_group(std::move(memory_manager)), + _kernel_hor(support::cpp14::make_unique()), + _kernel_vert(support::cpp14::make_unique()), + _border_handler(support::cpp14::make_unique()), + _tmp() { } +CLGaussian5x5::~CLGaussian5x5() = default; + void CLGaussian5x5::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) { configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value); @@ -55,9 +63,9 @@ void CLGaussian5x5::configure(const CLCompileContext &compile_context, ICLTensor _memory_group.manage(&_tmp); // Configure kernels - _kernel_hor.configure(compile_context, input, &_tmp, border_mode == BorderMode::UNDEFINED); - _kernel_vert.configure(compile_context, &_tmp, output, border_mode == BorderMode::UNDEFINED); - _border_handler.configure(compile_context, input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value)); + _kernel_hor->configure(compile_context, input, &_tmp, border_mode == BorderMode::UNDEFINED); + _kernel_vert->configure(compile_context, &_tmp, output, border_mode == BorderMode::UNDEFINED); + _border_handler->configure(compile_context, input, _kernel_hor->border_size(), border_mode, PixelValue(constant_border_value)); // Allocate intermediate buffers _tmp.allocator()->allocate(); @@ -65,10 +73,10 @@ void CLGaussian5x5::configure(const CLCompileContext &compile_context, ICLTensor void CLGaussian5x5::run() { - CLScheduler::get().enqueue(_border_handler, false); + CLScheduler::get().enqueue(*_border_handler, false); MemoryGroupResourceScope scope_mg(_memory_group); - CLScheduler::get().enqueue(_kernel_hor, false); - CLScheduler::get().enqueue(_kernel_vert); + CLScheduler::get().enqueue(*_kernel_hor, false); + CLScheduler::get().enqueue(*_kernel_vert); } diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp index 297d535ba5..66b85352c1 100644 --- a/src/runtime/CL/functions/CLGaussianPyramid.cpp +++ b/src/runtime/CL/functions/CLGaussianPyramid.cpp @@ -24,19 +24,21 @@ #include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h" -#include "arm_compute/core/CL/kernels/CLScaleKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" - #include "arm_compute/runtime/CL/CLPyramid.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" #include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include 
"src/core/CL/kernels/CLGaussian5x5Kernel.h" +#include "src/core/CL/kernels/CLGaussianPyramidKernel.h" +#include "src/core/CL/kernels/CLScaleKernel.h" +#include "support/MemorySupport.h" #include @@ -47,6 +49,8 @@ CLGaussianPyramid::CLGaussianPyramid() { } +CLGaussianPyramid::~CLGaussianPyramid() = default; + CLGaussianPyramidHalf::CLGaussianPyramidHalf() // NOLINT : _horizontal_border_handler(), _vertical_border_handler(), @@ -55,6 +59,8 @@ CLGaussianPyramidHalf::CLGaussianPyramidHalf() // NOLINT { } +CLGaussianPyramidHalf::~CLGaussianPyramidHalf() = default; + void CLGaussianPyramidHalf::configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) { configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, border_mode, constant_border_value); @@ -80,10 +86,10 @@ void CLGaussianPyramidHalf::configure(const CLCompileContext &compile_context, I if(num_levels > 1) { - _horizontal_border_handler.resize(num_levels - 1); - _vertical_border_handler.resize(num_levels - 1); - _horizontal_reduction.resize(num_levels - 1); - _vertical_reduction.resize(num_levels - 1); + _horizontal_border_handler.reserve(num_levels - 1); + _vertical_border_handler.reserve(num_levels - 1); + _horizontal_reduction.reserve(num_levels - 1); + _vertical_reduction.reserve(num_levels - 1); // Apply half scale to the X dimension of the tensor shape TensorShape tensor_shape = pyramid->info()->tensor_shape(); @@ -95,16 +101,20 @@ void CLGaussianPyramidHalf::configure(const CLCompileContext &compile_context, I for(size_t i = 0; i < num_levels - 1; ++i) { /* Configure horizontal kernel */ - _horizontal_reduction[i].configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i)); + _horizontal_reduction.emplace_back(support::cpp14::make_unique()); + _horizontal_reduction.back()->configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i)); /* Configure vertical kernel */ - _vertical_reduction[i].configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1)); + _vertical_reduction.emplace_back(support::cpp14::make_unique()); + _vertical_reduction.back()->configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1)); /* Configure border */ - _horizontal_border_handler[i].configure(compile_context, _pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value)); + _horizontal_border_handler.emplace_back(support::cpp14::make_unique()); + _horizontal_border_handler.back()->configure(compile_context, _pyramid->get_pyramid_level(i), _horizontal_reduction.back()->border_size(), border_mode, PixelValue(constant_border_value)); /* Configure border */ - _vertical_border_handler[i].configure(compile_context, _tmp.get_pyramid_level(i), _vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16)); + _vertical_border_handler.emplace_back(support::cpp14::make_unique()); + _vertical_border_handler.back()->configure(compile_context, _tmp.get_pyramid_level(i), _vertical_reduction.back()->border_size(), border_mode, PixelValue(pixel_value_u16)); } _tmp.allocate(); } @@ -127,10 +137,10 @@ void CLGaussianPyramidHalf::run() for(unsigned int i = 0; i < num_levels - 1; ++i) { - CLScheduler::get().enqueue(_horizontal_border_handler[i], false); - CLScheduler::get().enqueue(_horizontal_reduction[i], false); - CLScheduler::get().enqueue(_vertical_border_handler[i], false); - CLScheduler::get().enqueue(_vertical_reduction[i], 
false); + CLScheduler::get().enqueue(*_horizontal_border_handler[i], false); + CLScheduler::get().enqueue(*_horizontal_reduction[i], false); + CLScheduler::get().enqueue(*_vertical_border_handler[i], false); + CLScheduler::get().enqueue(*_vertical_reduction[i], false); } } @@ -163,7 +173,7 @@ void CLGaussianPyramidOrb::configure(const CLCompileContext &compile_context, IC if(num_levels > 1) { _gauss5x5.resize(num_levels - 1); - _scale_nearest.resize(num_levels - 1); + _scale_nearest.reserve(num_levels - 1); PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8); @@ -175,7 +185,8 @@ void CLGaussianPyramidOrb::configure(const CLCompileContext &compile_context, IC _gauss5x5[i].configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value); /* Configure scale image kernel */ - _scale_nearest[i].configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), ScaleKernelInfo{ InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, PixelValue(), SamplingPolicy::CENTER }); + _scale_nearest.emplace_back(support::cpp14::make_unique()); + _scale_nearest.back()->configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), ScaleKernelInfo{ InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, PixelValue(), SamplingPolicy::CENTER }); } _tmp.allocate(); @@ -199,6 +210,6 @@ void CLGaussianPyramidOrb::run() for(unsigned int i = 0; i < num_levels - 1; ++i) { _gauss5x5[i].run(); - CLScheduler::get().enqueue(_scale_nearest[i]); + CLScheduler::get().enqueue(*_scale_nearest[i]); } } diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp index 5291de074a..87bf39030a 100644 --- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp +++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp @@ -25,22 +25,29 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" +#include "src/core/CL/kernels/CLDequantizationLayerKernel.h" +#include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h" +#include "src/core/CL/kernels/CLPadLayerKernel.h" +#include "src/core/CL/kernels/CLPermuteKernel.h" +#include "src/core/CL/kernels/CLQuantizationLayerKernel.h" #include "src/core/helpers/AutoConfiguration.h" +#include "support/MemorySupport.h" namespace arm_compute { CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr memory_manager) : _memory_group(memory_manager), - _permute_deltas_kernel(), + _permute_deltas_kernel(support::cpp14::make_unique()), _flatten_deltas(), - _permute_scores_kernel(), + _permute_scores_kernel(support::cpp14::make_unique()), _flatten_scores(), - _compute_anchors_kernel(), - _bounding_box_kernel(), - _pad_kernel(), - _dequantize_anchors(), - _dequantize_deltas(), - _quantize_all_proposals(), + _compute_anchors_kernel(support::cpp14::make_unique()), + _bounding_box_kernel(support::cpp14::make_unique()), + _pad_kernel(support::cpp14::make_unique()), + _dequantize_anchors(support::cpp14::make_unique()), + _dequantize_deltas(support::cpp14::make_unique()), + _quantize_all_proposals(support::cpp14::make_unique()), _cpp_nms(memory_manager), _is_nhwc(false), _is_qasymm8(false), @@ -62,6 +69,8 @@ CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptrconfigure(compile_context, anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); const 
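Note: the run() hunks above, and all the ones that follow, rely on the scheduler's flush flag. Assuming CLScheduler::enqueue(ICLKernel &kernel, bool flush = true), the signature used throughout this library version, kernels enqueued with false are merely batched on the command queue and only the last enqueue of a chain submits it. A minimal sketch of the contract:

    #include "arm_compute/core/CL/ICLKernel.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"

    using namespace arm_compute;

    void run_chain(ICLKernel &first, ICLKernel &middle, ICLKernel &last)
    {
        CLScheduler::get().enqueue(first, false);  // batched, not submitted
        CLScheduler::get().enqueue(middle, false); // batched, not submitted
        CLScheduler::get().enqueue(last);          // flush = true: submit batch
    }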
TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors); _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); @@ -102,7 +111,7 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context if(!_is_nhwc) { _memory_group.manage(&_deltas_permuted); - _permute_deltas_kernel.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 }); + _permute_deltas_kernel->configure(compile_context, deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 }); _flatten_deltas.configure(compile_context, &_deltas_permuted, &_deltas_flattened); _deltas_permuted.allocator()->allocate(); } @@ -119,7 +128,7 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context if(!_is_nhwc) { _memory_group.manage(&_scores_permuted); - _permute_scores_kernel.configure(compile_context, scores, &_scores_permuted, PermutationVector{ 2, 0, 1 }); + _permute_scores_kernel->configure(compile_context, scores, &_scores_permuted, PermutationVector{ 2, 0, 1 }); _flatten_scores.configure(compile_context, &_scores_permuted, &_scores_flattened); _scores_permuted.allocator()->allocate(); } @@ -137,18 +146,18 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context _memory_group.manage(&_all_anchors_f32); _memory_group.manage(&_deltas_flattened_f32); // Dequantize anchors to float - _dequantize_anchors.configure(compile_context, &_all_anchors, &_all_anchors_f32); + _dequantize_anchors->configure(compile_context, &_all_anchors, &_all_anchors_f32); _all_anchors.allocator()->allocate(); anchors_to_use = &_all_anchors_f32; // Dequantize deltas to float - _dequantize_deltas.configure(compile_context, &_deltas_flattened, &_deltas_flattened_f32); + _dequantize_deltas->configure(compile_context, &_deltas_flattened, &_deltas_flattened_f32); _deltas_flattened.allocator()->allocate(); deltas_to_use = &_deltas_flattened_f32; } // Bounding box transform _memory_group.manage(&_all_proposals); BoundingBoxTransformInfo bbox_info(info.im_width(), info.im_height(), 1.f); - _bounding_box_kernel.configure(compile_context, anchors_to_use, &_all_proposals, deltas_to_use, bbox_info); + _bounding_box_kernel->configure(compile_context, anchors_to_use, &_all_proposals, deltas_to_use, bbox_info); deltas_to_use->allocator()->allocate(); anchors_to_use->allocator()->allocate(); @@ -158,7 +167,7 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context _memory_group.manage(&_all_proposals_quantized); // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); - _quantize_all_proposals.configure(compile_context, &_all_proposals, &_all_proposals_quantized); + _quantize_all_proposals->configure(compile_context, &_all_proposals, &_all_proposals_quantized); _all_proposals.allocator()->allocate(); _all_proposals_to_use = &_all_proposals_quantized; } @@ -193,7 +202,7 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context _scores_flattened.allocator()->allocate(); // Add the first column that represents the batch id. 
This will be all zeros, as we don't support multiple images - _pad_kernel.configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } }); + _pad_kernel->configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } }); _proposals_4_roi_values.allocator()->allocate(); } @@ -343,34 +352,34 @@ void CLGenerateProposalsLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Compute all the anchors - CLScheduler::get().enqueue(_compute_anchors_kernel, false); + CLScheduler::get().enqueue(*_compute_anchors_kernel, false); // Transpose and reshape the inputs if(!_is_nhwc) { - CLScheduler::get().enqueue(_permute_deltas_kernel, false); - CLScheduler::get().enqueue(_permute_scores_kernel, false); + CLScheduler::get().enqueue(*_permute_deltas_kernel, false); + CLScheduler::get().enqueue(*_permute_scores_kernel, false); } _flatten_deltas.run(); _flatten_scores.run(); if(_is_qasymm8) { - CLScheduler::get().enqueue(_dequantize_anchors, false); - CLScheduler::get().enqueue(_dequantize_deltas, false); + CLScheduler::get().enqueue(*_dequantize_anchors, false); + CLScheduler::get().enqueue(*_dequantize_deltas, false); } // Build the boxes - CLScheduler::get().enqueue(_bounding_box_kernel, false); + CLScheduler::get().enqueue(*_bounding_box_kernel, false); if(_is_qasymm8) { - CLScheduler::get().enqueue(_quantize_all_proposals, false); + CLScheduler::get().enqueue(*_quantize_all_proposals, false); } // Non maxima suppression run_cpp_nms_kernel(); // Add dummy batch indexes - CLScheduler::get().enqueue(_pad_kernel, true); + CLScheduler::get().enqueue(*_pad_kernel, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp index 21fa6690ea..80026532ab 100644 --- a/src/runtime/CL/functions/CLHOGDescriptor.cpp +++ b/src/runtime/CL/functions/CLHOGDescriptor.cpp @@ -28,14 +28,26 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLHOGDescriptorKernel.h" +#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h" +#include "support/MemorySupport.h" using namespace arm_compute; CLHOGDescriptor::CLHOGDescriptor(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space() + : _memory_group(std::move(memory_manager)), + _gradient(), + _orient_bin(support::cpp14::make_unique()), + _block_norm(support::cpp14::make_unique()), + _mag(), + _phase(), + _hog_space() { } +CLHOGDescriptor::~CLHOGDescriptor() = default; + void CLHOGDescriptor::configure(ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value) { configure(CLKernelLibrary::get().get_compile_context(), input, output, hog, border_mode, constant_border_value); @@ -87,10 +99,10 @@ void CLHOGDescriptor::configure(const CLCompileContext &compile_context, ICLTens _memory_group.manage(&_hog_space); // Initialise orientation binning kernel - _orient_bin.configure(compile_context, &_mag, &_phase, &_hog_space, hog->info()); + _orient_bin->configure(compile_context, &_mag, &_phase, &_hog_space, hog->info()); // Initialize HOG norm kernel - _block_norm.configure(compile_context, &_hog_space, output, hog->info()); + _block_norm->configure(compile_context, &_hog_space, output, hog->info()); // Allocate intermediate tensors 
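Note: in the QASYMM8 branch of CLGenerateProposalsLayer above, anchors and deltas are dequantized, the bounding-box transform runs in float, and the proposals are requantized to QASYMM16 with scale 0.125 and zero offset. A standalone sketch of that fixed-point round trip (the real kernels also saturate and follow a configurable rounding policy, omitted here):

    #include <cmath>
    #include <cstdint>

    constexpr float scale  = 0.125f; // from the TensorInfo above
    constexpr int   offset = 0;

    uint16_t quantize_qasymm16(float x) // q = round(x / scale) + offset
    {
        return static_cast<uint16_t>(std::lround(x / scale) + offset);
    }

    float dequantize_qasymm16(uint16_t q) // x = scale * (q - offset)
    {
        return scale * static_cast<float>(static_cast<int>(q) - offset);
    }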
_mag.allocator()->allocate(); @@ -106,8 +118,8 @@ void CLHOGDescriptor::run() _gradient.run(); // Run orientation binning - CLScheduler::get().enqueue(_orient_bin, false); + CLScheduler::get().enqueue(*_orient_bin, false); // Run block normalization - CLScheduler::get().enqueue(_block_norm); + CLScheduler::get().enqueue(*_block_norm); } \ No newline at end of file diff --git a/src/runtime/CL/functions/CLHOGDetector.cpp b/src/runtime/CL/functions/CLHOGDetector.cpp index 9188f654dc..07ae8151c0 100644 --- a/src/runtime/CL/functions/CLHOGDetector.cpp +++ b/src/runtime/CL/functions/CLHOGDetector.cpp @@ -23,19 +23,22 @@ */ #include "arm_compute/runtime/CL/functions/CLHOGDetector.h" -#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLHOGDetectorKernel.h" +#include "support/MemorySupport.h" #include using namespace arm_compute; CLHOGDetector::CLHOGDetector() - : _hog_detector_kernel(), _detection_windows(nullptr), _num_detection_windows() + : _hog_detector_kernel(support::cpp14::make_unique()), _detection_windows(nullptr), _num_detection_windows() { } +CLHOGDetector::~CLHOGDetector() = default; + void CLHOGDetector::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class) { configure(CLKernelLibrary::get().get_compile_context(), input, hog, detection_windows, detection_window_stride, threshold, idx_class); @@ -50,7 +53,7 @@ void CLHOGDetector::configure(const CLCompileContext &compile_context, const ICL _num_detection_windows = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int)); // Configure HOGDetectorKernel - _hog_detector_kernel.configure(compile_context, input, hog, detection_windows, &_num_detection_windows, detection_window_stride, threshold, idx_class); + _hog_detector_kernel->configure(compile_context, input, hog, detection_windows, &_num_detection_windows, detection_window_stride, threshold, idx_class); } void CLHOGDetector::run() @@ -62,7 +65,7 @@ void CLHOGDetector::run() q.enqueueWriteBuffer(_num_detection_windows, CL_FALSE, 0, sizeof(unsigned int), &init_num_detection_windows); // Run CLHOGDetectorKernel - CLScheduler::get().enqueue(_hog_detector_kernel); + CLScheduler::get().enqueue(*_hog_detector_kernel); // Read number of detections unsigned int num_detection_windows = 0; diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp index 934d1f6351..5f3b9cf529 100644 --- a/src/runtime/CL/functions/CLHOGGradient.cpp +++ b/src/runtime/CL/functions/CLHOGGradient.cpp @@ -26,11 +26,18 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h" +#include "support/MemorySupport.h" using namespace arm_compute; CLHOGGradient::CLHOGGradient(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _derivative(), _mag_phase(), _gx(), _gy() + : _memory_group(std::move(memory_manager)), + _derivative(), + _mag_phase(support::cpp14::make_unique()), + _gx(), + _gy() { } @@ -63,11 +70,11 @@ void CLHOGGradient::configure(const CLCompileContext &compile_context, ICLTensor // Initialise magnitude/phase kernel if(PhaseType::UNSIGNED == phase_type) { - 
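Note: CLHOGDetector above keeps its detection count in a one-word OpenCL buffer that the kernel increments and the host reads back. The round trip, condensed into one function from the same OpenCL C++ API calls as the hunk (the detector kernel enqueue itself and error handling are elided):

    #include "arm_compute/runtime/CL/CLScheduler.h"

    using namespace arm_compute;

    unsigned int detector_count_round_trip()
    {
        // CL_MEM_ALLOC_HOST_PTR keeps the one-word counter host-accessible.
        cl::Buffer num_windows(CLScheduler::get().context(),
                               CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
                               sizeof(unsigned int));
        cl::CommandQueue q    = CLScheduler::get().queue();
        unsigned int     init = 0;
        q.enqueueWriteBuffer(num_windows, CL_FALSE, 0, sizeof(unsigned int), &init);
        // ... enqueue the detector kernel here ...
        unsigned int count = 0;
        q.enqueueReadBuffer(num_windows, CL_TRUE, 0, sizeof(unsigned int), &count);
        return count;
    }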
_mag_phase.configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::UNSIGNED); + _mag_phase->configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::UNSIGNED); } else { - _mag_phase.configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::SIGNED); + _mag_phase->configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::SIGNED); } // Allocate intermediate tensors @@ -83,5 +90,5 @@ void CLHOGGradient::run() _derivative.run(); // Run magnitude/phase kernel - CLScheduler::get().enqueue(_mag_phase); + CLScheduler::get().enqueue(*_mag_phase); } \ No newline at end of file diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp index 51db43cd71..dfc90537cf 100644 --- a/src/runtime/CL/functions/CLHOGMultiDetection.cpp +++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp @@ -30,6 +30,11 @@ #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/Scheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLHOGDescriptorKernel.h" +#include "src/core/CL/kernels/CLHOGDetectorKernel.h" +#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h" +#include "support/MemorySupport.h" using namespace arm_compute; @@ -52,6 +57,8 @@ CLHOGMultiDetection::CLHOGMultiDetection(std::shared_ptr memory_ { } +CLHOGMultiDetection::~CLHOGMultiDetection() = default; + void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides, BorderMode border_mode, uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance) { @@ -135,8 +142,8 @@ void CLHOGMultiDetection::configure(const CLCompileContext &compile_context, ICL _num_block_norm_kernel = input_block_norm.size(); // Number of CLHOGBlockNormalizationKernel kernels to compute _num_hog_detect_kernel = input_hog_detect.size(); // Number of CLHOGDetector functions to compute - _orient_bin_kernel.resize(_num_orient_bin_kernel); - _block_norm_kernel.resize(_num_block_norm_kernel); + _orient_bin_kernel.reserve(_num_orient_bin_kernel); + _block_norm_kernel.reserve(_num_block_norm_kernel); _hog_detect_kernel.resize(_num_hog_detect_kernel); _hog_space.resize(_num_orient_bin_kernel); _hog_norm_space.resize(_num_block_norm_kernel); @@ -181,7 +188,8 @@ void CLHOGMultiDetection::configure(const CLCompileContext &compile_context, ICL _memory_group.manage(&_hog_space[i]); // Initialise orientation binning kernel - _orient_bin_kernel[i].configure(compile_context, &_mag, &_phase, &_hog_space[i], multi_hog->model(idx_multi_hog)->info()); + _orient_bin_kernel.emplace_back(support::cpp14::make_unique()); + _orient_bin_kernel.back()->configure(compile_context, &_mag, &_phase, &_hog_space[i], multi_hog->model(idx_multi_hog)->info()); } // Allocate intermediate tensors @@ -202,7 +210,8 @@ void CLHOGMultiDetection::configure(const CLCompileContext &compile_context, ICL _memory_group.manage(&_hog_norm_space[i]); // Initialize block normalization kernel - _block_norm_kernel[i].configure(compile_context, &_hog_space[idx_orient_bin], &_hog_norm_space[i], multi_hog->model(idx_multi_hog)->info()); + _block_norm_kernel.emplace_back(support::cpp14::make_unique()); + _block_norm_kernel.back()->configure(compile_context, 
&_hog_space[idx_orient_bin], &_hog_norm_space[i], multi_hog->model(idx_multi_hog)->info()); } // Allocate intermediate tensors @@ -248,13 +257,13 @@ void CLHOGMultiDetection::run() // Run orientation binning kernel for(size_t i = 0; i < _num_orient_bin_kernel; ++i) { - CLScheduler::get().enqueue(_orient_bin_kernel[i], false); + CLScheduler::get().enqueue(*_orient_bin_kernel[i], false); } // Run block normalization kernel for(size_t i = 0; i < _num_block_norm_kernel; ++i) { - CLScheduler::get().enqueue(_block_norm_kernel[i], false); + CLScheduler::get().enqueue(*_block_norm_kernel[i], false); } // Run HOG detector kernel diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp index 45b93a5be0..9d8ebceb30 100644 --- a/src/runtime/CL/functions/CLHarrisCorners.cpp +++ b/src/runtime/CL/functions/CLHarrisCorners.cpp @@ -24,8 +24,6 @@ #include "arm_compute/runtime/CL/functions/CLHarrisCorners.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" -#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" @@ -35,6 +33,10 @@ #include "arm_compute/runtime/CL/functions/CLSobel7x7.h" #include "arm_compute/runtime/ITensorAllocator.h" #include "arm_compute/runtime/Scheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLHarrisCornersKernel.h" +#include "src/core/CL/kernels/CLSobel5x5Kernel.h" +#include "src/core/CL/kernels/CLSobel7x7Kernel.h" #include "support/MemorySupport.h" #include @@ -45,12 +47,12 @@ using namespace arm_compute; CLHarrisCorners::CLHarrisCorners(std::shared_ptr memory_manager) // NOLINT : _memory_group(std::move(memory_manager)), _sobel(nullptr), - _harris_score(), + _harris_score(support::cpp14::make_unique()), _non_max_suppr(), _candidates(), _sort_euclidean(), - _border_gx(), - _border_gy(), + _border_gx(support::cpp14::make_unique()), + _border_gy(support::cpp14::make_unique()), _gx(), _gy(), _score(), @@ -61,6 +63,8 @@ CLHarrisCorners::CLHarrisCorners(std::shared_ptr memory_manager) { } +CLHarrisCorners::~CLHarrisCorners() = default; + void CLHarrisCorners::configure(ICLImage *input, float threshold, float min_dist, float sensitivity, int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners, BorderMode border_mode, uint8_t constant_border_value, bool use_fp16) @@ -133,11 +137,11 @@ void CLHarrisCorners::configure(const CLCompileContext &compile_context, ICLImag _memory_group.manage(&_score); // Set/init Harris Score kernel accordingly with block_size - _harris_score.configure(compile_context, &_gx, &_gy, &_score, block_size, pow4_normalization_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED); + _harris_score->configure(compile_context, &_gx, &_gy, &_score, block_size, pow4_normalization_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED); // Configure border filling using harris score kernel's block size - _border_gx.configure(compile_context, &_gx, _harris_score.border_size(), border_mode, PixelValue(constant_border_value)); - _border_gy.configure(compile_context, &_gy, _harris_score.border_size(), border_mode, PixelValue(constant_border_value)); + _border_gx->configure(compile_context, &_gx, _harris_score->border_size(), border_mode, PixelValue(constant_border_value)); + _border_gy->configure(compile_context, &_gy, _harris_score->border_size(), border_mode, 
PixelValue(constant_border_value)); // Allocate intermediate buffers _gx.allocator()->allocate(); @@ -175,11 +179,11 @@ void CLHarrisCorners::run() _sobel->run(); // Fill border before harris score kernel - CLScheduler::get().enqueue(_border_gx, false); - CLScheduler::get().enqueue(_border_gy, false); + CLScheduler::get().enqueue(*_border_gx, false); + CLScheduler::get().enqueue(*_border_gy, false); // Run harris score kernel - CLScheduler::get().enqueue(_harris_score, false); + CLScheduler::get().enqueue(*_harris_score, false); // Run non-maxima suppression _non_max_suppr.run(); diff --git a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp index 4a60ee9d08..bd680f448d 100644 --- a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp @@ -23,9 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h" -#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h" #include "arm_compute/core/Types.h" - +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLIntegralImage.cpp b/src/runtime/CL/functions/CLIntegralImage.cpp index 8561494242..41e47e77c7 100644 --- a/src/runtime/CL/functions/CLIntegralImage.cpp +++ b/src/runtime/CL/functions/CLIntegralImage.cpp @@ -23,16 +23,20 @@ */ #include "arm_compute/runtime/CL/functions/CLIntegralImage.h" -#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLIntegralImageKernel.h" +#include "support/MemorySupport.h" using namespace arm_compute; CLIntegralImage::CLIntegralImage() - : _integral_hor(), _integral_vert() + : _integral_hor(support::cpp14::make_unique()), + _integral_vert(support::cpp14::make_unique()) { } +CLIntegralImage::~CLIntegralImage() = default; + void CLIntegralImage::configure(const ICLTensor *input, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, output); @@ -40,12 +44,12 @@ void CLIntegralImage::configure(const ICLTensor *input, ICLTensor *output) void CLIntegralImage::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { - _integral_hor.configure(compile_context, input, output); - _integral_vert.configure(compile_context, output); + _integral_hor->configure(compile_context, input, output); + _integral_vert->configure(compile_context, output); } void CLIntegralImage::run() { - CLScheduler::get().enqueue(_integral_hor, false); - CLScheduler::get().enqueue(_integral_vert); + CLScheduler::get().enqueue(*_integral_hor, false); + CLScheduler::get().enqueue(*_integral_vert); } diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp index 66191d1799..64aac269cd 100644 --- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp +++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp @@ -24,12 +24,15 @@ #include "arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include 
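Note: CLIntegralImage above is a fixed two-kernel pipeline: the horizontal kernel writes row prefix sums into the output, then the vertical kernel accumulates columns in place. What the pair computes, as a standalone scalar sketch:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<uint32_t> integral(const std::vector<uint8_t> &src, int w, int h)
    {
        std::vector<uint32_t> dst(static_cast<std::size_t>(w) * h, 0);
        for(int y = 0; y < h; ++y)  // pass 1: horizontal prefix sums
            for(int x = 0; x < w; ++x)
                dst[y * w + x] = src[y * w + x] + (x > 0 ? dst[y * w + x - 1] : 0);
        for(int y = 1; y < h; ++y)  // pass 2: vertical accumulation, in place
            for(int x = 0; x < w; ++x)
                dst[y * w + x] += dst[(y - 1) * w + x];
        return dst;
    }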
"src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h" +#include "src/core/CL/kernels/CLReductionOperationKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { @@ -39,10 +42,15 @@ constexpr int max_input_tensor_dim = 3; } // namespace CLL2NormalizeLayer::CLL2NormalizeLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq() + : _memory_group(std::move(memory_manager)), + _reduce_func(), + _normalize_kernel(support::cpp14::make_unique()), + _sumsq() { } +CLL2NormalizeLayer::~CLL2NormalizeLayer() = default; + void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, int axis, float epsilon) { configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, epsilon); @@ -59,7 +67,7 @@ void CLL2NormalizeLayer::configure(const CLCompileContext &compile_context, ICLT // Configure kernels const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim); _reduce_func.configure(compile_context, input, &_sumsq, actual_axis, ReductionOperation::SUM_SQUARE); - _normalize_kernel.configure(compile_context, input, &_sumsq, output, axis, epsilon); + _normalize_kernel->configure(compile_context, input, &_sumsq, output, axis, epsilon); // Allocate intermediate tensor _sumsq.allocator()->allocate(); @@ -91,6 +99,6 @@ void CLL2NormalizeLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); _reduce_func.run(); - CLScheduler::get().enqueue(_normalize_kernel, true); + CLScheduler::get().enqueue(*_normalize_kernel, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp index 058b6027c2..b095c06535 100644 --- a/src/runtime/CL/functions/CLLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLLSTMLayer.cpp @@ -29,6 +29,22 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLCopyKernel.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "src/core/CL/kernels/CLMemsetKernel.h" +#include "src/core/CL/kernels/CLTransposeKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { @@ -37,20 +53,23 @@ using namespace arm_compute::utils::info_helpers; CLLSTMLayer::CLLSTMLayer(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(), - _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _transpose_cell_state(), - 
_accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), - _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _projection_clip(), - _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(), - _ones_memset_kernel(), _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), _pixelwise_mul_forget_gate_coeff(), - _accum_forget_gate_bias(), _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), _pixelwise_mul_output_gate_coeff(), - _accum_output_gate_bias(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), - _forget_gate_out5(), _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), - _cell_state_activation(), _output_state1(), _ones(), _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), _cell_layer_norm_out1(), - _cell_layer_norm_out2(), _output_layer_norm_out1(), _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), - _perform_projection_clipping(false), _is_prepared(false), _is_layer_norm_lstm(false) + _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), + _transpose_cell_state(support::cpp14::make_unique()), _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), + _pixelwise_mul_cell_state2(), _fully_connected_output(), _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), + _fully_connected_output_state(), _projection_clip(), _copy_cell_state(support::cpp14::make_unique()), _copy_output(support::cpp14::make_unique()), _concat_scratch_buffer(), + _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(), _ones_memset_kernel(support::cpp14::make_unique()), + _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), _pixelwise_mul_forget_gate_coeff(), _accum_forget_gate_bias(), + _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), _pixelwise_mul_output_gate_coeff(), _accum_output_gate_bias(), _input_gate_out1(), + _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _forget_gate_out6(), + _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _cell_state_activation(), _output_state1(), _ones(), + _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), _cell_layer_norm_out1(), 
_cell_layer_norm_out2(), _output_layer_norm_out1(), + _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false), + _is_layer_norm_lstm(false) { } +CLLSTMLayer::~CLLSTMLayer() = default; + void CLLSTMLayer::configure(const ICLTensor *input, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, @@ -172,7 +191,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe { _memory_group.manage(&_input_gate_out1); _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); - _ones_memset_kernel.configure(compile_context, &_ones, PixelValue(1, _ones.info()->data_type())); + _ones_memset_kernel->configure(compile_context, &_ones, PixelValue(1, _ones.info()->data_type())); _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE); _ones.allocator()->allocate(); _run_cifg_opt = true; @@ -241,7 +260,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _memory_group.manage(&_cell_state_out1); _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1); _memory_group.manage(&_cell_state_out2); - _transpose_cell_state.configure(compile_context, recurrent_to_cell_weights, &_cell_state_out2); + _transpose_cell_state->configure(compile_context, recurrent_to_cell_weights, &_cell_state_out2); _memory_group.manage(&_cell_state_out3); _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f); _cell_state_out2.allocator()->allocate(); @@ -367,8 +386,8 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe } // Copy cell state and output - _copy_cell_state.configure(compile_context, &_cell_state_out1, cell_state_out); - _copy_output.configure(compile_context, output_state_out, output); + _copy_cell_state->configure(compile_context, &_cell_state_out1, cell_state_out); + _copy_output->configure(compile_context, output_state_out, output); // Vector for holding the tensors to store in scratch buffer std::vector scratch_inputs; @@ -642,7 +661,7 @@ void CLLSTMLayer::run() if(_run_cifg_opt) { - CLScheduler::get().enqueue(_ones_memset_kernel); + CLScheduler::get().enqueue(*_ones_memset_kernel); _subtract_input_gate.run(); } else @@ -665,7 +684,7 @@ void CLLSTMLayer::run() } _fully_connected_cell_state.run(); - CLScheduler::get().enqueue(_transpose_cell_state); + CLScheduler::get().enqueue(*_transpose_cell_state); _gemm_cell_state1.run(); _accum_cell_state1.run(); if(_is_layer_norm_lstm) @@ -711,8 +730,8 @@ void CLLSTMLayer::run() } } - CLScheduler::get().enqueue(_copy_cell_state); - CLScheduler::get().enqueue(_copy_output); + CLScheduler::get().enqueue(*_copy_cell_state); + CLScheduler::get().enqueue(*_copy_output); _concat_scratch_buffer.run(); } diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp index 76a531b1c9..46062387e7 100644 --- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp +++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp @@ -27,6 +27,14 @@ #include "arm_compute/core/Utils.h" 
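Note: the CIFG branch above has no trained input gate: a ones tensor is filled once by the memset kernel and the input gate is then derived as 1 - forget_gate with a saturating subtraction. The arithmetic, as a plain sketch (saturation omitted since gate activations already lie in [0, 1]):

    #include <cstddef>
    #include <vector>

    std::vector<float> cifg_input_gate(const std::vector<float> &forget_gate)
    {
        std::vector<float> ones(forget_gate.size(), 1.0f); // memset kernel's role
        std::vector<float> input_gate(forget_gate.size());
        for(std::size_t i = 0; i < forget_gate.size(); ++i)
            input_gate[i] = ones[i] - forget_gate[i];      // coupled input/forget gates
        return input_gate;
    }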
#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp index 81e903cde8..1ad19e56ea 100644 --- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp +++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp @@ -32,6 +32,9 @@ #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h" #include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" #include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGaussian5x5Kernel.h" +#include "src/core/CL/kernels/CLGaussianPyramidKernel.h" using namespace arm_compute; diff --git a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp index cbb952c3f6..d7fd81754b 100644 --- a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp +++ b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp @@ -23,11 +23,13 @@ */ #include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/IPyramid.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp index 74cb47347f..04e59ac4a6 100644 --- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp @@ -27,6 +27,11 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLCol2ImKernel.h" +#include "src/core/CL/kernels/CLIm2ColKernel.h" +#include "src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLWeightsReshapeKernel.h" +#include "support/MemorySupport.h" #include #include @@ -78,8 +83,16 @@ void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, cons } // namespace CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), - _is_prepared(false), _original_weights(nullptr) + : _memory_group(std::move(memory_manager)), + _input_im2col_kernel(support::cpp14::make_unique()), + _weights_reshape_kernel(support::cpp14::make_unique()), + _mm_kernel(support::cpp14::make_unique()), + _output_col2im_kernel(support::cpp14::make_unique()), + _input_im2col_reshaped(), + _weights_reshaped(), + _gemm_output(), + _is_prepared(false), + _original_weights(nullptr) { } @@ 
-169,16 +182,16 @@ void CLLocallyConnectedLayer::configure(const CLCompileContext &compile_context, _memory_group.manage(&_gemm_output); // Configure kernels - _input_im2col_kernel.configure(compile_context, input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias); - _weights_reshape_kernel.configure(compile_context, weights, biases, &_weights_reshaped); - _mm_kernel.configure(compile_context, &_input_im2col_reshaped, &_weights_reshaped, &_gemm_output); - _output_col2im_kernel.configure(compile_context, &_gemm_output, output, Size2D(conv_w, conv_h)); + _input_im2col_kernel->configure(compile_context, input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias); + _weights_reshape_kernel->configure(compile_context, weights, biases, &_weights_reshaped); + _mm_kernel->configure(compile_context, &_input_im2col_reshaped, &_weights_reshaped, &_gemm_output); + _output_col2im_kernel->configure(compile_context, &_gemm_output, output, Size2D(conv_w, conv_h)); // Allocate intermediate tensors _input_im2col_reshaped.allocator()->allocate(); _gemm_output.allocator()->allocate(); - CLScheduler::get().tune_kernel_static(_input_im2col_kernel); + CLScheduler::get().tune_kernel_static(*_input_im2col_kernel); } void CLLocallyConnectedLayer::run() @@ -188,13 +201,13 @@ void CLLocallyConnectedLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Run input reshaping - CLScheduler::get().enqueue(_input_im2col_kernel); + CLScheduler::get().enqueue(*_input_im2col_kernel); // Runs vector matrix multiply on reshaped matrices - CLScheduler::get().enqueue(_mm_kernel); + CLScheduler::get().enqueue(*_mm_kernel); // Reshape output matrix - CLScheduler::get().enqueue(_output_col2im_kernel, false); + CLScheduler::get().enqueue(*_output_col2im_kernel.get(), false); } void CLLocallyConnectedLayer::prepare() @@ -205,7 +218,7 @@ void CLLocallyConnectedLayer::prepare() // Run weights reshaping and mark original weights tensor as unused _weights_reshaped.allocator()->allocate(); - CLScheduler::get().enqueue(_weights_reshape_kernel); + CLScheduler::get().enqueue(*_weights_reshape_kernel); _original_weights->mark_as_unused(); CLScheduler::get().queue().finish(); diff --git a/src/runtime/CL/functions/CLMagnitude.cpp b/src/runtime/CL/functions/CLMagnitude.cpp index 962adadbb2..fb3ebdaa96 100644 --- a/src/runtime/CL/functions/CLMagnitude.cpp +++ b/src/runtime/CL/functions/CLMagnitude.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLMagnitude.h" -#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h" +#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp index 3e32c55067..392bff2b4e 100644 --- a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp +++ b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp @@ -24,18 +24,23 @@ #include "arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLMaxUnpoolingLayerKernel.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h" +#include "src/core/CL/kernels/CLMemsetKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { CLMaxUnpoolingLayer::CLMaxUnpoolingLayer() - : _memset_kernel(), _unpooling_layer_kernel() + : 
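Note: the locally-connected hunks above show the library's prepare() idiom in miniature: the weights-reshape kernel runs exactly once, after which the original weights can be reclaimed. Stitched back together from the scattered hunks (no new API, only the calls already visible above, with the usual _is_prepared guard assumed):

    void CLLocallyConnectedLayer::prepare()
    {
        if(!_is_prepared)
        {
            _weights_reshaped.allocator()->allocate();            // one-time buffer
            CLScheduler::get().enqueue(*_weights_reshape_kernel); // one-time reshape
            _original_weights->mark_as_unused();                  // source reclaimable
            CLScheduler::get().queue().finish();                  // reshape must finish first
            _is_prepared = true;
        }
    }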
_memset_kernel(support::cpp14::make_unique()), + _unpooling_layer_kernel(support::cpp14::make_unique()) { } +CLMaxUnpoolingLayer::~CLMaxUnpoolingLayer() = default; + void CLMaxUnpoolingLayer::configure(ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info) { configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, pool_info); @@ -44,9 +49,9 @@ void CLMaxUnpoolingLayer::configure(ICLTensor *input, ICLTensor *indices, ICLTen void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info) { const PixelValue zero_value(0.f); - _memset_kernel.configure(output, zero_value); + _memset_kernel->configure(output, zero_value); - _unpooling_layer_kernel.configure(compile_context, input, indices, output, pool_info); + _unpooling_layer_kernel->configure(compile_context, input, indices, output, pool_info); } Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info) @@ -57,9 +62,9 @@ Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo void CLMaxUnpoolingLayer::run() { // Run memset - CLScheduler::get().enqueue(_memset_kernel, false); + CLScheduler::get().enqueue(*_memset_kernel, false); // Run max unpooling layer - CLScheduler::get().enqueue(_unpooling_layer_kernel); + CLScheduler::get().enqueue(*_unpooling_layer_kernel); } } /* namespace arm_compute */ diff --git a/src/runtime/CL/functions/CLMeanStdDev.cpp b/src/runtime/CL/functions/CLMeanStdDev.cpp index 2517fdc4ef..c91bc954b8 100644 --- a/src/runtime/CL/functions/CLMeanStdDev.cpp +++ b/src/runtime/CL/functions/CLMeanStdDev.cpp @@ -25,6 +25,10 @@ #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/functions/CLMeanStdDev.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLMeanStdDevKernel.h" +#include "src/core/CL/kernels/CLReductionOperationKernel.h" +#include "support/MemorySupport.h" using namespace arm_compute; @@ -39,13 +43,15 @@ CLMeanStdDev::CLMeanStdDev(std::shared_ptr memory_manager) // NO _reduction_output_stddev(), _mean(nullptr), _stddev(nullptr), - _mean_stddev_kernel(), - _fill_border_kernel(), + _mean_stddev_kernel(support::cpp14::make_unique()), + _fill_border_kernel(support::cpp14::make_unique()), _global_sum(), _global_sum_squared() { } +CLMeanStdDev::~CLMeanStdDev() = default; + Status CLMeanStdDev::validate(ITensorInfo *input, float *mean, float *stddev) { ARM_COMPUTE_RETURN_ERROR_ON_TENSOR_NOT_2D(input); @@ -101,8 +107,8 @@ void CLMeanStdDev::configure(const CLCompileContext &compile_context, ICLImage * _global_sum_squared = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong)); } - _mean_stddev_kernel.configure(compile_context, input, mean, &_global_sum, stddev, &_global_sum_squared); - _fill_border_kernel.configure(compile_context, input, _mean_stddev_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast(0))); + _mean_stddev_kernel->configure(compile_context, input, mean, &_global_sum, stddev, &_global_sum_squared); + _fill_border_kernel->configure(compile_context, input, _mean_stddev_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast(0))); } } @@ -149,8 +155,8 @@ void CLMeanStdDev::run_float() void CLMeanStdDev::run_int() { - CLScheduler::get().enqueue(_fill_border_kernel); - 
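Note: max unpooling is the two kernels above composed: zero the output, then scatter each pooled value back to the argmax index the pooling layer recorded. A standalone scalar sketch:

    #include <cstddef>
    #include <vector>

    std::vector<float> max_unpool(const std::vector<float>       &pooled,
                                  const std::vector<std::size_t> &indices,
                                  std::size_t                     output_size)
    {
        std::vector<float> out(output_size, 0.0f); // CLMemsetKernel's role
        for(std::size_t i = 0; i < pooled.size(); ++i)
            out[indices[i]] = pooled[i];           // unpooling kernel's role
        return out;
    }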
CLScheduler::get().enqueue(_mean_stddev_kernel); + CLScheduler::get().enqueue(*_fill_border_kernel); + CLScheduler::get().enqueue(*_mean_stddev_kernel); } void CLMeanStdDev::run() diff --git a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp index 07ab669fde..5b5ff49ecb 100644 --- a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h" -#include "arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLMedian3x3.cpp b/src/runtime/CL/functions/CLMedian3x3.cpp index 92153128f9..2040ebd4f5 100644 --- a/src/runtime/CL/functions/CLMedian3x3.cpp +++ b/src/runtime/CL/functions/CLMedian3x3.cpp @@ -23,8 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLMedian3x3.h" -#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLMedian3x3Kernel.h" #include "support/MemorySupport.h" #include @@ -41,5 +42,5 @@ void CLMedian3x3::configure(const CLCompileContext &compile_context, ICLTensor * auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLMinMaxLocation.cpp b/src/runtime/CL/functions/CLMinMaxLocation.cpp index a27defe2f7..3ddd4d04ed 100644 --- a/src/runtime/CL/functions/CLMinMaxLocation.cpp +++ b/src/runtime/CL/functions/CLMinMaxLocation.cpp @@ -22,14 +22,15 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLMinMaxLocation.h" - #include "arm_compute/core/CL/CLHelpers.h" +#include "src/core/CL/kernels/CLMinMaxLocationKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { CLMinMaxLocation::CLMinMaxLocation() - : _min_max_kernel(), - _min_max_loc_kernel(), + : _min_max_kernel(support::cpp14::make_unique()), + _min_max_loc_kernel(support::cpp14::make_unique()), _min_max_vals(), _min_max_count_vals(), _min(nullptr), @@ -41,6 +42,8 @@ CLMinMaxLocation::CLMinMaxLocation() { } +CLMinMaxLocation::~CLMinMaxLocation() = default; + void CLMinMaxLocation::configure(const ICLImage *input, void *min, void *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count) { configure(CLKernelLibrary::get().get_compile_context(), input, min, max, min_loc, max_loc, min_count, max_count); @@ -62,16 +65,16 @@ void CLMinMaxLocation::configure(const CLCompileContext &compile_context, const _min_loc = min_loc; _max_loc = max_loc; - _min_max_kernel.configure(compile_context, input, &_min_max_vals); - _min_max_loc_kernel.configure(compile_context, input, &_min_max_vals, &_min_max_count_vals, _min_loc, _max_loc); + _min_max_kernel->configure(compile_context, input, &_min_max_vals); + _min_max_loc_kernel->configure(compile_context, input, &_min_max_vals, &_min_max_count_vals, _min_loc, _max_loc); } void CLMinMaxLocation::run() { cl::CommandQueue q = CLScheduler::get().queue(); - CLScheduler::get().enqueue(_min_max_kernel, false); - CLScheduler::get().enqueue(_min_max_loc_kernel, false); + CLScheduler::get().enqueue(*_min_max_kernel, false); + CLScheduler::get().enqueue(*_min_max_loc_kernel, false); // Update min and max q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 0 * sizeof(int32_t), sizeof(int32_t), static_cast(_min)); diff --git a/src/runtime/CL/functions/CLNonLinearFilter.cpp b/src/runtime/CL/functions/CLNonLinearFilter.cpp index 71f08e8072..3312f6f9a7 100644 --- a/src/runtime/CL/functions/CLNonLinearFilter.cpp +++ b/src/runtime/CL/functions/CLNonLinearFilter.cpp @@ -23,7 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLNonLinearFilter.h" -#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLNonLinearFilterKernel.h" #include "support/MemorySupport.h" #include @@ -42,5 +43,5 @@ void CLNonLinearFilter::configure(const CLCompileContext &compile_context, ICLTe auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp index a79bb0c5a3..22ca176a71 100644 --- a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp +++ b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp @@ -23,7 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h" -#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h" #include "support/MemorySupport.h" #include @@ -43,10 
+44,10 @@ void CLNonMaximaSuppression3x3::configure(const CLCompileContext &compile_contex if(border_mode != BorderMode::UNDEFINED) { - _border_handler.configure(compile_context, input, _kernel->border_size(), BorderMode::CONSTANT); + _border_handler->configure(compile_context, input, _kernel->border_size(), BorderMode::CONSTANT); } else { - _border_handler.configure(compile_context, input, _kernel->border_size(), BorderMode::UNDEFINED); + _border_handler->configure(compile_context, input, _kernel->border_size(), BorderMode::UNDEFINED); } } diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp index 4be6257bbf..40a6cdd2f4 100644 --- a/src/runtime/CL/functions/CLNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp @@ -25,18 +25,25 @@ #include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h" #include "arm_compute/core/Error.h" +#include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLNormalizationLayerKernel.h" +#include "support/MemorySupport.h" using namespace arm_compute; CLNormalizationLayer::CLNormalizationLayer() - : _norm_kernel(), _border_handler() + : _norm_kernel(support::cpp14::make_unique()), + _border_handler(support::cpp14::make_unique()) { } +CLNormalizationLayer::~CLNormalizationLayer() = default; + void CLNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info) { configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info); @@ -47,10 +54,10 @@ void CLNormalizationLayer::configure(const CLCompileContext &compile_context, IC ARM_COMPUTE_ERROR_ON(input == nullptr); // Configure normalization kernel - _norm_kernel.configure(compile_context, input, output, norm_info); + _norm_kernel->configure(compile_context, input, output, norm_info); // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel - _border_handler.configure(compile_context, input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue()); + _border_handler->configure(compile_context, input, _norm_kernel->border_size(), BorderMode::CONSTANT, PixelValue()); } Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info) @@ -61,8 +68,8 @@ Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInf void CLNormalizationLayer::run() { // Run border handler - CLScheduler::get().enqueue(_border_handler, false); + CLScheduler::get().enqueue(*_border_handler, false); // Run normalization kernel - CLScheduler::get().enqueue(_norm_kernel); + CLScheduler::get().enqueue(*_norm_kernel); } diff --git a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp index 806e6489a2..9576486db0 100644 --- a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp +++ b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp @@ -24,7 +24,7 @@ #include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h" -#include "arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" +#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp 
b/src/runtime/CL/functions/CLOpticalFlow.cpp index 0b5547eaab..fca6192296 100644 --- a/src/runtime/CL/functions/CLOpticalFlow.cpp +++ b/src/runtime/CL/functions/CLOpticalFlow.cpp @@ -24,7 +24,6 @@ #include "arm_compute/runtime/CL/functions/CLOpticalFlow.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Window.h" @@ -33,6 +32,8 @@ #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" #include "arm_compute/runtime/CL/functions/CLScharr3x3.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLLKTrackerKernel.h" #include "support/MemorySupport.h" using namespace arm_compute; @@ -42,7 +43,7 @@ CLOpticalFlow::CLOpticalFlow(std::shared_ptr memory_manager) // _tracker_init_kernel(), _tracker_stage0_kernel(), _tracker_stage1_kernel(), - _tracker_finalize_kernel(), + _tracker_finalize_kernel(support::cpp14::make_unique()), _func_scharr(), _scharr_gx(), _scharr_gy(), @@ -57,6 +58,8 @@ CLOpticalFlow::CLOpticalFlow(std::shared_ptr memory_manager) // { } +CLOpticalFlow::~CLOpticalFlow() = default; + void CLOpticalFlow::configure(const CLPyramid *old_pyramid, const CLPyramid *new_pyramid, const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points, Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate, @@ -93,9 +96,9 @@ void CLOpticalFlow::configure(const CLCompileContext &compile_context, const CLP const int old_values_list_length = list_length * window_dimension * window_dimension; // Create kernels and tensors - _tracker_init_kernel.resize(_num_levels); - _tracker_stage0_kernel.resize(_num_levels); - _tracker_stage1_kernel.resize(_num_levels); + _tracker_init_kernel.reserve(_num_levels); + _tracker_stage0_kernel.reserve(_num_levels); + _tracker_stage1_kernel.reserve(_num_levels); _func_scharr.resize(_num_levels); _scharr_gx.resize(_num_levels); _scharr_gy.resize(_num_levels); @@ -134,16 +137,19 @@ void CLOpticalFlow::configure(const CLCompileContext &compile_context, const CLP _func_scharr[i].configure(compile_context, old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value); // Init Lucas-Kanade init kernel - _tracker_init_kernel[i].configure(compile_context, old_points, new_points_estimates, _old_points_internal.get(), _new_points_internal.get(), use_initial_estimate, i, _num_levels, pyr_scale); + _tracker_init_kernel.emplace_back(support::cpp14::make_unique()); + _tracker_init_kernel.back()->configure(compile_context, old_points, new_points_estimates, _old_points_internal.get(), _new_points_internal.get(), use_initial_estimate, i, _num_levels, pyr_scale); // Init Lucas-Kanade stage0 kernel - _tracker_stage0_kernel[i].configure(compile_context, old_ith_input, &_scharr_gx[i], &_scharr_gy[i], - _old_points_internal.get(), _new_points_internal.get(), _coefficient_table.get(), _old_values.get(), - window_dimension, i); + _tracker_stage0_kernel.emplace_back(support::cpp14::make_unique()); + _tracker_stage0_kernel.back()->configure(compile_context, old_ith_input, &_scharr_gx[i], &_scharr_gy[i], + _old_points_internal.get(), _new_points_internal.get(), _coefficient_table.get(), _old_values.get(), + window_dimension, i); // Init Lucas-Kanade stage1 kernel - _tracker_stage1_kernel[i].configure(compile_context, new_ith_input, 
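Note: the per-level kernel triples configured here implement pyramidal Lucas-Kanade; the run() hunk that follows walks them from the coarsest level down to level 0, refining the point estimates at each step, then runs the finalize kernel once. The schedule as a skeleton (the run_* functions are hypothetical stand-ins for the per-level enqueues):

    #include <cstddef>

    void run_scharr(std::size_t level); // gradients for one pyramid level
    void run_init(std::size_t level);   // scale point estimates into the level
    void run_stage0(std::size_t level); // per-point spatial gradient matrix
    void run_stage1(std::size_t level); // iterative LK refinement
    void finalize();                    // copy internal points to the user array

    void track(std::size_t num_levels)
    {
        for(std::size_t level = num_levels; level > 0; --level)
        {
            run_scharr(level - 1);
            run_init(level - 1);
            run_stage0(level - 1);
            run_stage1(level - 1);
        }
        finalize();
    }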
_new_points_internal.get(), _coefficient_table.get(), _old_values.get(), - termination, epsilon, num_iterations, window_dimension, i); + _tracker_stage1_kernel.emplace_back(support::cpp14::make_unique<CLLKTrackerStage1Kernel>()); + _tracker_stage1_kernel.back()->configure(compile_context, new_ith_input, _new_points_internal.get(), _coefficient_table.get(), _old_values.get(), + termination, epsilon, num_iterations, window_dimension, i); // Allocate intermediate buffers _scharr_gx[i].allocator()->allocate(); @@ -151,7 +157,7 @@ void CLOpticalFlow::configure(const CLCompileContext &compile_context, const CLP } // Finalize Lucas-Kanade - _tracker_finalize_kernel.configure(compile_context, _new_points_internal.get(), new_points); + _tracker_finalize_kernel->configure(compile_context, _new_points_internal.get(), new_points); } void CLOpticalFlow::run() @@ -166,14 +172,14 @@ void CLOpticalFlow::run() _func_scharr[level - 1].run(); // Run Lucas-Kanade init kernel - CLScheduler::get().enqueue(_tracker_init_kernel[level - 1]); + CLScheduler::get().enqueue(*_tracker_init_kernel[level - 1]); // Run Lucas-Kanade stage0 kernel - CLScheduler::get().enqueue(_tracker_stage0_kernel[level - 1]); + CLScheduler::get().enqueue(*_tracker_stage0_kernel[level - 1]); // Run Lucas-Kanade stage1 kernel - CLScheduler::get().enqueue(_tracker_stage1_kernel[level - 1]); + CLScheduler::get().enqueue(*_tracker_stage1_kernel[level - 1]); } - CLScheduler::get().enqueue(_tracker_finalize_kernel, true); + CLScheduler::get().enqueue(*_tracker_finalize_kernel, true); } diff --git a/src/runtime/CL/functions/CLPReluLayer.cpp b/src/runtime/CL/functions/CLPReluLayer.cpp index aaddd46071..60cf4d1a2d 100644 --- a/src/runtime/CL/functions/CLPReluLayer.cpp +++ b/src/runtime/CL/functions/CLPReluLayer.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h" +#include "src/core/CL/kernels/CLElementwiseOperationKernel.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/runtime/CL/CLScheduler.h" diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp index fb6078cc79..388b07b76e 100644 --- a/src/runtime/CL/functions/CLPadLayer.cpp +++ b/src/runtime/CL/functions/CLPadLayer.cpp @@ -22,14 +22,21 @@ * SOFTWARE.
*/ #include "arm_compute/runtime/CL/functions/CLPadLayer.h" +#include "src/core/CL/kernels/CLCopyKernel.h" +#include "src/core/CL/kernels/CLPadLayerKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { CLPadLayer::CLPadLayer() - : _pad_kernel(), _copy_kernel(), _perform_pad(false) + : _pad_kernel(support::cpp14::make_unique<CLPadLayerKernel>()), + _copy_kernel(support::cpp14::make_unique<CLCopyKernel>()), + _perform_pad(false) { } +CLPadLayer::~CLPadLayer() = default; + void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) { configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode); @@ -46,12 +53,12 @@ void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *i if(_perform_pad) { - _pad_kernel.configure(compile_context, input, output, padding, constant_value, mode); + _pad_kernel->configure(compile_context, input, output, padding, constant_value, mode); } else { // Copy the input to the whole output if no padding is applied - _copy_kernel.configure(compile_context, input, output); + _copy_kernel->configure(compile_context, input, output); } } Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode) @@ -75,11 +82,11 @@ void CLPadLayer::run() { if(_perform_pad) { - CLScheduler::get().enqueue(_pad_kernel); + CLScheduler::get().enqueue(*_pad_kernel); } else { - CLScheduler::get().enqueue(_copy_kernel); + CLScheduler::get().enqueue(*_copy_kernel); } } } // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/CL/functions/CLPermute.cpp b/src/runtime/CL/functions/CLPermute.cpp index e13046bd46..f7f0bc4f5d 100644 --- a/src/runtime/CL/functions/CLPermute.cpp +++ b/src/runtime/CL/functions/CLPermute.cpp @@ -24,8 +24,8 @@ #include "arm_compute/runtime/CL/functions/CLPermute.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLPermuteKernel.h" #include "arm_compute/core/Error.h" +#include "src/core/CL/kernels/CLPermuteKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLPhase.cpp b/src/runtime/CL/functions/CLPhase.cpp index 64d2e0fdff..6594cd5bac 100644 --- a/src/runtime/CL/functions/CLPhase.cpp +++ b/src/runtime/CL/functions/CLPhase.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLPhase.h" -#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h" +#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp index 883ce68536..12cc5d60af 100644 --- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp +++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp @@ -24,8 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLPixelWiseMultiplicationKernel.h" #include "support/MemorySupport.h" #include @@ -55,7 +56,7 @@ ITensorPack select_border_input(ITensorPack &tensors) namespace experimental { CLPixelWiseMultiplication::CLPixelWiseMultiplication() - : _border_handler() + :
_border_handler(support::cpp14::make_unique<CLFillBorderKernel>()) { } @@ -72,7 +73,7 @@ void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_contex if(broadcasted_info->dimension(0) == 1) { - _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); } } } @@ -86,12 +87,12 @@ Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITen void CLPixelWiseMultiplication::run(ITensorPack &tensors) { auto border_pack = select_border_input(tensors); - CLScheduler::get().enqueue_op(_border_handler, border_pack); + CLScheduler::get().enqueue_op(*_border_handler, border_pack); ICLOperator::run(tensors); } CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication() - : _border_handler() + : _border_handler(support::cpp14::make_unique<CLFillBorderKernel>()) { } @@ -107,7 +108,7 @@ void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile if(broadcasted_info->dimension(0) == 1) { - _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); } } } @@ -120,7 +121,7 @@ Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, con void CLComplexPixelWiseMultiplication::run(ITensorPack &tensors) { auto border_pack = select_border_input(tensors); - CLScheduler::get().enqueue_op(_border_handler, border_pack); + CLScheduler::get().enqueue_op(*_border_handler, border_pack); ICLOperator::run(tensors); } } // namespace experimental diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp index a14818fffe..7f99aee9ba 100644 --- a/src/runtime/CL/functions/CLPoolingLayer.cpp +++ b/src/runtime/CL/functions/CLPoolingLayer.cpp @@ -24,8 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLPoolingLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLPoolingLayerKernel.h" #include "support/MemorySupport.h" namespace arm_compute @@ -79,7 +80,7 @@ void CLPoolingLayer::configure(const CLCompileContext &compile_context, ICLTenso default: ARM_COMPUTE_ERROR("Data layout not supported"); } - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, pixel_value); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, pixel_value); // Tune kernels CLScheduler::get().tune_kernel_static(*_kernel); diff --git a/src/runtime/CL/functions/CLPriorBoxLayer.cpp b/src/runtime/CL/functions/CLPriorBoxLayer.cpp index fefbff639d..8cb971793e 100644 --- a/src/runtime/CL/functions/CLPriorBoxLayer.cpp +++ b/src/runtime/CL/functions/CLPriorBoxLayer.cpp @@ -24,13 +24,13 @@ #include "arm_compute/runtime/CL/functions/CLPriorBoxLayer.h" -#include "arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" - +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h" #include "support/MemorySupport.h" using namespace arm_compute; diff --git
a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp index 2d21d210e4..54df5a0a5e 100644 --- a/src/runtime/CL/functions/CLQLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp @@ -30,7 +30,18 @@ #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLCopyKernel.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" #include "src/core/helpers/WindowHelpers.h" +#include "support/MemorySupport.h" namespace arm_compute { @@ -86,10 +97,50 @@ void CLQLSTMLayer::TensorCopyKernel::run() } CLQLSTMLayer::CLQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager) + : _input_to_input_reduction(support::cpp14::make_unique<CLGEMMLowpMatrixAReductionKernel>()), + _recurrent_to_input_reduction(support::cpp14::make_unique<CLGEMMLowpMatrixAReductionKernel>()), + _input_to_forget_reduction(support::cpp14::make_unique<CLGEMMLowpMatrixAReductionKernel>()), + _recurrent_to_forget_reduction(support::cpp14::make_unique<CLGEMMLowpMatrixAReductionKernel>()), + _input_to_cell_reduction(support::cpp14::make_unique<CLGEMMLowpMatrixAReductionKernel>()), + _recurrent_to_cell_reduction(support::cpp14::make_unique<CLGEMMLowpMatrixAReductionKernel>()), + _input_to_output_reduction(support::cpp14::make_unique<CLGEMMLowpMatrixAReductionKernel>()), + _recurrent_to_output_reduction(support::cpp14::make_unique<CLGEMMLowpMatrixAReductionKernel>()), + _projection_reduction(support::cpp14::make_unique<CLGEMMLowpMatrixAReductionKernel>()), + _layer_norms(), + _copy_output(support::cpp14::make_unique<CLCopyKernel>()) { + for(auto &norm : _layer_norms) + { + norm = support::cpp14::make_unique<CLQLSTMLayerNormalizationKernel>(); + } + _memory_group = MemoryGroup(std::move(memory_manager)); } +CLQLSTMLayer::~CLQLSTMLayer() = default; + +void CLQLSTMLayer::configure_layer_norm(LayerNormGate g, const ICLTensor *in) +{ + ARM_COMPUTE_ERROR_ON(!_has_layer_norm); + + CLTensor *out = &get_layer_norm_output(g); + _memory_group.manage(out); + out->allocator()->init(*(in->info())); + + get_layer_norm(g).configure(in, out, get_layer_norm_weight(g), get_layer_norm_bias(g)); +} + +Status CLQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias) +{ + // Output quantization scale will be different, but ignored here + // since it will be configured at configure() stage.
+ const TensorInfo out + { + in + }; + return CLQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias); +} + void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info, const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias, CLTensor *mm_res, CLTensor *outstage_res, float gemmlowp_scale, @@ -200,18 +251,18 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT _input_to_input_weights = lstm_params.input_to_input_weights(); _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights(); - _input_to_input_reduction.configure(compile_context, _input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_input_reduction.configure(compile_context, _recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_input_reduction->configure(compile_context, _input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_input_reduction->configure(compile_context, _recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); } - _input_to_forget_reduction.configure(compile_context, input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_forget_reduction.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_cell_reduction.configure(compile_context, input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_cell_reduction.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_output_reduction.configure(compile_context, input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_output_reduction.configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_forget_reduction->configure(compile_context, input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_forget_reduction->configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_cell_reduction->configure(compile_context, input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_cell_reduction->configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_output_reduction->configure(compile_context, input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + 
_recurrent_to_output_reduction->configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); if(_has_projection) { - _projection_reduction.configure(compile_context, _projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); + _projection_reduction->configure(compile_context, _projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); if(_projection_bias != nullptr) { _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE); @@ -543,7 +594,7 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT } // Copy output_state_out to output - _copy_output.configure(compile_context, output_state_out, output); + _copy_output->configure(compile_context, output_state_out, output); } Status CLQLSTMLayer::validate(const ITensorInfo *input, @@ -1049,7 +1100,7 @@ void CLQLSTMLayer::run() } // Copy output_state_out to output - CLScheduler::get().enqueue(_copy_output); + CLScheduler::get().enqueue(*_copy_output); } void CLQLSTMLayer::prepare() @@ -1081,8 +1132,8 @@ void CLQLSTMLayer::prepare() { _input_to_input_eff_bias.allocator()->allocate(); _recurrent_to_input_eff_bias.allocator()->allocate(); - CLScheduler::get().enqueue(_input_to_input_reduction); - CLScheduler::get().enqueue(_recurrent_to_input_reduction); + CLScheduler::get().enqueue(*_input_to_input_reduction); + CLScheduler::get().enqueue(*_recurrent_to_input_reduction); _input_to_input_weights_transposed.allocator()->allocate(); _recurrent_to_input_weights_transposed.allocator()->allocate(); @@ -1097,17 +1148,17 @@ void CLQLSTMLayer::prepare() _recurrent_to_cell_eff_bias.allocator()->allocate(); _input_to_output_eff_bias.allocator()->allocate(); _recurrent_to_output_eff_bias.allocator()->allocate(); - CLScheduler::get().enqueue(_input_to_forget_reduction); - CLScheduler::get().enqueue(_recurrent_to_forget_reduction); - CLScheduler::get().enqueue(_input_to_cell_reduction); - CLScheduler::get().enqueue(_recurrent_to_cell_reduction); - CLScheduler::get().enqueue(_input_to_output_reduction); - CLScheduler::get().enqueue(_recurrent_to_output_reduction); + CLScheduler::get().enqueue(*_input_to_forget_reduction); + CLScheduler::get().enqueue(*_recurrent_to_forget_reduction); + CLScheduler::get().enqueue(*_input_to_cell_reduction); + CLScheduler::get().enqueue(*_recurrent_to_cell_reduction); + CLScheduler::get().enqueue(*_input_to_output_reduction); + CLScheduler::get().enqueue(*_recurrent_to_output_reduction); if(_has_projection) { _projection_eff_bias.allocator()->allocate(); - CLScheduler::get().enqueue(_projection_reduction); + CLScheduler::get().enqueue(*_projection_reduction); if(_projection_bias != nullptr) { _projection_bias_add.run(); diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp index f0a446acab..f132547eb9 100644 --- a/src/runtime/CL/functions/CLQuantizationLayer.cpp +++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h" -#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h" +#include "src/core/CL/kernels/CLQuantizationLayerKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git 
a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp index 94e7f9440c..be3e539f98 100644 --- a/src/runtime/CL/functions/CLRNNLayer.cpp +++ b/src/runtime/CL/functions/CLRNNLayer.cpp @@ -28,17 +28,33 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLCopyKernel.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { using namespace arm_compute::misc::shape_calculator; CLRNNLayer::CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(), - _is_prepared(false) + : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation(), _fully_connected_kernel(), _copy_kernel(support::cpp14::make_unique<CLCopyKernel>()), _fully_connected_out(), + _gemm_output(), _add_output(), _is_prepared(false) { } +CLRNNLayer::~CLRNNLayer() = default; + Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, const ITensorInfo *output, const ActivationLayerInfo &info) { @@ -107,7 +123,7 @@ void CLRNNLayer::configure(const CLCompileContext &compile_context, const ICLTen _activation.configure(compile_context, &_add_output, hidden_state, info); _add_output.allocator()->allocate(); - _copy_kernel.configure(compile_context, hidden_state, output); + _copy_kernel->configure(compile_context, hidden_state, output); } void CLRNNLayer::run() @@ -122,7 +138,7 @@ _activation.run(); // copy hidden out to output - CLScheduler::get().enqueue(_copy_kernel); + CLScheduler::get().enqueue(*_copy_kernel); } void CLRNNLayer::prepare() diff --git a/src/runtime/CL/functions/CLROIAlignLayer.cpp b/src/runtime/CL/functions/CLROIAlignLayer.cpp index 2337cee33f..cf28a1a0fb 100644 --- a/src/runtime/CL/functions/CLROIAlignLayer.cpp +++ b/src/runtime/CL/functions/CLROIAlignLayer.cpp @@ -24,7 +24,8 @@ #include "arm_compute/runtime/CL/functions/CLROIAlignLayer.h" #include "arm_compute/core/CL/ICLArray.h" -#include "arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h" +#include "src/core/CL/kernels/CLROIAlignLayerKernel.h" +#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp index cdf60ce04f..b0e6716cce 100644 --- a/src/runtime/CL/functions/CLROIPoolingLayer.cpp +++
b/src/runtime/CL/functions/CLROIPoolingLayer.cpp @@ -22,10 +22,8 @@ * SOFTWARE. */ #include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h" - #include "arm_compute/core/CL/ICLArray.h" - -#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h" +#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" #include "support/MemorySupport.h" using namespace arm_compute; diff --git a/src/runtime/CL/functions/CLRange.cpp b/src/runtime/CL/functions/CLRange.cpp index 8bf2a0c43e..57b57bd305 100644 --- a/src/runtime/CL/functions/CLRange.cpp +++ b/src/runtime/CL/functions/CLRange.cpp @@ -24,10 +24,10 @@ #include "arm_compute/runtime/CL/functions/CLRange.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLRangeKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLRangeKernel.h" #include "support/MemorySupport.h" using namespace arm_compute; diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp index 4ea7f7642f..b761dc2f99 100644 --- a/src/runtime/CL/functions/CLReduceMean.cpp +++ b/src/runtime/CL/functions/CLReduceMean.cpp @@ -24,11 +24,12 @@ #include "arm_compute/runtime/CL/functions/CLReduceMean.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "src/core/CL/CLValidate.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLReductionOperationKernel.h" #include "src/core/helpers/AutoConfiguration.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp index 208371c45d..7423f4bc87 100644 --- a/src/runtime/CL/functions/CLReductionOperation.cpp +++ b/src/runtime/CL/functions/CLReductionOperation.cpp @@ -30,9 +30,10 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLReductionOperationKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/runtime/Utils.h" - #include "support/MemorySupport.h" namespace arm_compute @@ -43,6 +44,8 @@ CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memor { } +CLReductionOperation::~CLReductionOperation() = default; + Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -211,7 +214,7 @@ void CLReductionOperation::configure(const CLCompileContext &compile_context, IC } // Configure reduction operation kernels - _reduction_kernels_vector.resize(_num_of_stages); + _reduction_kernels_vector.reserve(_num_of_stages); // Create temporary tensors if(_is_serial) { @@ -221,11 +224,12 @@ _memory_group.manage(&_results_vector.back()); } - _reduction_kernels_vector[0].configure(compile_context, input, output_internal, axis, op, 0); + _reduction_kernels_vector.emplace_back(support::cpp14::make_unique<CLReductionOperationKernel>()); + _reduction_kernels_vector[0]->configure(compile_context, input, output_internal, axis, op, 0); } else { -
_border_handlers_vector.resize(_num_of_stages); + _border_handlers_vector.reserve(_num_of_stages); _memory_group.manage(&_results_vector[0]); ReductionOperation first_kernel_op; @@ -269,15 +273,23 @@ void CLReductionOperation::configure(const CLCompileContext &compile_context, IC ARM_COMPUTE_ERROR("Not supported"); } - _reduction_kernels_vector[0].configure(compile_context, input, &_results_vector[0], axis, first_kernel_op); - _border_handlers_vector[0].configure(compile_context, input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue); + _reduction_kernels_vector.emplace_back(support::cpp14::make_unique<CLReductionOperationKernel>()); + _reduction_kernels_vector[0]->configure(compile_context, input, &_results_vector[0], axis, first_kernel_op); + + _border_handlers_vector.emplace_back(support::cpp14::make_unique<CLFillBorderKernel>()); + _border_handlers_vector[0]->configure(compile_context, input, _reduction_kernels_vector[0]->border_size(), BorderMode::CONSTANT, pixelValue); // Apply ReductionOperation on intermediate stages for(unsigned int i = 1; i < _num_of_stages - 1; ++i) { _memory_group.manage(&_results_vector[i]); - _reduction_kernels_vector[i].configure(compile_context, &_results_vector[i - 1], &_results_vector[i], axis, intermediate_kernel_op); - _border_handlers_vector[i].configure(compile_context, &_results_vector[i - 1], _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue); + + _reduction_kernels_vector.emplace_back(support::cpp14::make_unique<CLReductionOperationKernel>()); + _reduction_kernels_vector[i]->configure(compile_context, &_results_vector[i - 1], &_results_vector[i], axis, intermediate_kernel_op); + + _border_handlers_vector.emplace_back(support::cpp14::make_unique<CLFillBorderKernel>()); + _border_handlers_vector[i]->configure(compile_context, &_results_vector[i - 1], _reduction_kernels_vector[i]->border_size(), BorderMode::CONSTANT, pixelValue); + _results_vector[i - 1].allocator()->allocate(); } @@ -290,8 +302,12 @@ void CLReductionOperation::configure(const CLCompileContext &compile_context, IC _memory_group.manage(&_results_vector.back()); } - _reduction_kernels_vector[last_stage].configure(compile_context, &_results_vector[last_stage - 1], output_internal, axis, last_kernel_op, input_width); - _border_handlers_vector[last_stage].configure(compile_context, &_results_vector[last_stage - 1], _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue); + _reduction_kernels_vector.emplace_back(support::cpp14::make_unique<CLReductionOperationKernel>()); + _reduction_kernels_vector[last_stage]->configure(compile_context, &_results_vector[last_stage - 1], output_internal, axis, last_kernel_op, input_width); + + _border_handlers_vector.emplace_back(support::cpp14::make_unique<CLFillBorderKernel>()); + _border_handlers_vector[last_stage]->configure(compile_context, &_results_vector[last_stage - 1], _reduction_kernels_vector[last_stage]->border_size(), BorderMode::CONSTANT, pixelValue); + _results_vector[last_stage - 1].allocator()->allocate(); } @@ -308,14 +324,14 @@ void CLReductionOperation::run() if(_is_serial) { - CLScheduler::get().enqueue(_reduction_kernels_vector[0], false); + CLScheduler::get().enqueue(*_reduction_kernels_vector[0], false); } else { for(unsigned int i = 0; i < _num_of_stages; ++i) { - CLScheduler::get().enqueue(_border_handlers_vector[i], false); - CLScheduler::get().enqueue(_reduction_kernels_vector[i], false); + CLScheduler::get().enqueue(*_border_handlers_vector[i], false); + CLScheduler::get().enqueue(*_reduction_kernels_vector[i], false); } } diff --git a/src/runtime/CL/functions/CLRemap.cpp
b/src/runtime/CL/functions/CLRemap.cpp index 1e3d614402..6466c2843b 100644 --- a/src/runtime/CL/functions/CLRemap.cpp +++ b/src/runtime/CL/functions/CLRemap.cpp @@ -24,11 +24,12 @@ #include "arm_compute/runtime/CL/functions/CLRemap.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLRemapKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLRemapKernel.h" #include "support/MemorySupport.h" #include @@ -53,5 +54,5 @@ void CLRemap::configure(const CLCompileContext &compile_context, ICLTensor *inpu auto k = arm_compute::support::cpp14::make_unique<CLRemapKernel>(); k->configure(compile_context, input, map_x, map_y, output, policy, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLReorgLayer.cpp b/src/runtime/CL/functions/CLReorgLayer.cpp index 1dc41aefb5..4b2f70334f 100644 --- a/src/runtime/CL/functions/CLReorgLayer.cpp +++ b/src/runtime/CL/functions/CLReorgLayer.cpp @@ -24,10 +24,10 @@ #include "arm_compute/runtime/CL/functions/CLReorgLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLReorgLayerKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "src/core/CL/kernels/CLReorgLayerKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp index 273a761a0a..5112064b23 100644 --- a/src/runtime/CL/functions/CLReshapeLayer.cpp +++ b/src/runtime/CL/functions/CLReshapeLayer.cpp @@ -24,7 +24,7 @@ #include "arm_compute/runtime/CL/functions/CLReshapeLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h" +#include "src/core/CL/kernels/CLReshapeLayerKernel.h" #include "support/MemorySupport.h" /** [CLReshapeLayer snippet] **/ diff --git a/src/runtime/CL/functions/CLReverse.cpp b/src/runtime/CL/functions/CLReverse.cpp index 213fbc8f32..b73d8de62e 100644 --- a/src/runtime/CL/functions/CLReverse.cpp +++ b/src/runtime/CL/functions/CLReverse.cpp @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLReverse.h" -#include "arm_compute/core/CL/kernels/CLReverseKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLReverseKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp index e111c6d1f7..383b0cc305 100644 --- a/src/runtime/CL/functions/CLScale.cpp +++ b/src/runtime/CL/functions/CLScale.cpp @@ -24,10 +24,11 @@ #include "arm_compute/runtime/CL/functions/CLScale.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLScaleKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLScaleKernel.h" #include "support/MemorySupport.h" namespace arm_compute @@ -60,7 +61,7 @@ void CLScale::configure(const
CLCompileContext &compile_context, ICLTensor *inpu { border_mode_to_use = BorderMode::CONSTANT; } - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode_to_use, info.constant_border_value); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode_to_use, info.constant_border_value); } void CLScale::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, diff --git a/src/runtime/CL/functions/CLScharr3x3.cpp b/src/runtime/CL/functions/CLScharr3x3.cpp index b121ee7b99..e5d0d2d630 100644 --- a/src/runtime/CL/functions/CLScharr3x3.cpp +++ b/src/runtime/CL/functions/CLScharr3x3.cpp @@ -23,8 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLScharr3x3.h" -#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLScharr3x3Kernel.h" #include "support/MemorySupport.h" #include @@ -41,5 +42,5 @@ void CLScharr3x3::configure(const CLCompileContext &compile_context, ICLTensor * auto k = arm_compute::support::cpp14::make_unique<CLScharr3x3Kernel>(); k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp index ef8010847b..374da91b78 100644 --- a/src/runtime/CL/functions/CLSelect.cpp +++ b/src/runtime/CL/functions/CLSelect.cpp @@ -23,9 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLSelect.h" -#include "arm_compute/core/CL/kernels/CLSelectKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLSelectKernel.h" #include "support/MemorySupport.h" diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp index f36550ba91..940540563a 100644 --- a/src/runtime/CL/functions/CLSlice.cpp +++ b/src/runtime/CL/functions/CLSlice.cpp @@ -24,9 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLSlice.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" +#include "src/core/CL/kernels/CLStridedSliceKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLSobel3x3.cpp b/src/runtime/CL/functions/CLSobel3x3.cpp index 566a4a1534..78376f935a 100644 --- a/src/runtime/CL/functions/CLSobel3x3.cpp +++ b/src/runtime/CL/functions/CLSobel3x3.cpp @@ -23,14 +23,17 @@ */ #include "arm_compute/runtime/CL/functions/CLSobel3x3.h" -#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLSobel3x3Kernel.h" #include "support/MemorySupport.h" #include using namespace arm_compute; +CLSobel3x3::~CLSobel3x3() = default; + void CLSobel3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) { configure(CLKernelLibrary::get().get_compile_context(),
input, output_x, output_y, border_mode, constant_border_value); @@ -41,5 +44,5 @@ void CLSobel3x3::configure(const CLCompileContext &compile_context, ICLTensor *i auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3Kernel>(); k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLSobel5x5.cpp b/src/runtime/CL/functions/CLSobel5x5.cpp index f70e4f36f5..fa5d8945fb 100644 --- a/src/runtime/CL/functions/CLSobel5x5.cpp +++ b/src/runtime/CL/functions/CLSobel5x5.cpp @@ -24,20 +24,29 @@ #include "arm_compute/runtime/CL/functions/CLSobel5x5.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/ITensorAllocator.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLSobel5x5Kernel.h" +#include "support/MemorySupport.h" using namespace arm_compute; CLSobel5x5::CLSobel5x5(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y() + : _memory_group(std::move(memory_manager)), + _sobel_hor(support::cpp14::make_unique<CLSobel5x5HorKernel>()), + _sobel_vert(support::cpp14::make_unique<CLSobel5x5VertKernel>()), + _border_handler(support::cpp14::make_unique<CLFillBorderKernel>()), + _tmp_x(), + _tmp_y() { } +CLSobel5x5::~CLSobel5x5() = default; + void CLSobel5x5::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) { configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); @@ -58,8 +67,8 @@ void CLSobel5x5::configure(const CLCompileContext &compile_context, ICLTensor *i _tmp_y.allocator()->init(tensor_info); _memory_group.manage(&_tmp_x); _memory_group.manage(&_tmp_y); - _sobel_hor.configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); + _sobel_hor->configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert->configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); _tmp_y.allocator()->allocate(); } @@ -67,27 +76,27 @@ void CLSobel5x5::configure(const CLCompileContext &compile_context, ICLTensor *i { _tmp_x.allocator()->init(tensor_info); _memory_group.manage(&_tmp_x); - _sobel_hor.configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_hor->configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_vert->configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); } else if(run_sobel_y) { _tmp_y.allocator()->init(tensor_info); _memory_group.manage(&_tmp_y); -
_sobel_hor.configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); + _sobel_hor->configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert->configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); _tmp_y.allocator()->allocate(); } - _border_handler.configure(compile_context, input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _sobel_hor->border_size(), border_mode, PixelValue(constant_border_value)); } void CLSobel5x5::run() { - CLScheduler::get().enqueue(_border_handler, false); + CLScheduler::get().enqueue(*_border_handler, false); MemoryGroupResourceScope scope_mg(_memory_group); - CLScheduler::get().enqueue(_sobel_hor, false); - CLScheduler::get().enqueue(_sobel_vert); + CLScheduler::get().enqueue(*_sobel_hor, false); + CLScheduler::get().enqueue(*_sobel_vert); } diff --git a/src/runtime/CL/functions/CLSobel7x7.cpp b/src/runtime/CL/functions/CLSobel7x7.cpp index 792432e841..f462adb0ed 100644 --- a/src/runtime/CL/functions/CLSobel7x7.cpp +++ b/src/runtime/CL/functions/CLSobel7x7.cpp @@ -24,20 +24,29 @@ #include "arm_compute/runtime/CL/functions/CLSobel7x7.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/ITensorAllocator.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLSobel7x7Kernel.h" +#include "support/MemorySupport.h" using namespace arm_compute; CLSobel7x7::CLSobel7x7(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y() + : _memory_group(std::move(memory_manager)), + _sobel_hor(support::cpp14::make_unique<CLSobel7x7HorKernel>()), + _sobel_vert(support::cpp14::make_unique<CLSobel7x7VertKernel>()), + _border_handler(support::cpp14::make_unique<CLFillBorderKernel>()), + _tmp_x(), + _tmp_y() { } +CLSobel7x7::~CLSobel7x7() = default; + void CLSobel7x7::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) { configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); @@ -58,8 +67,8 @@ void CLSobel7x7::configure(const CLCompileContext &compile_context, ICLTensor *i _tmp_y.allocator()->init(tensor_info); _memory_group.manage(&_tmp_x); _memory_group.manage(&_tmp_y); - _sobel_hor.configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); + _sobel_hor->configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert->configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); _tmp_y.allocator()->allocate(); } @@ -67,27 +76,27 @@ void CLSobel7x7::configure(const CLCompileContext &compile_context, ICLTensor *i { _tmp_x.allocator()->init(tensor_info); _memory_group.manage(&_tmp_x); - _sobel_hor.configure(compile_context, input, &_tmp_x, nullptr, border_mode ==
BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_hor->configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_vert->configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); } else if(run_sobel_y) { _tmp_y.allocator()->init(tensor_info); _memory_group.manage(&_tmp_y); - _sobel_hor.configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); + _sobel_hor->configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert->configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); _tmp_y.allocator()->allocate(); } - _border_handler.configure(compile_context, input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _sobel_hor->border_size(), border_mode, PixelValue(constant_border_value)); } void CLSobel7x7::run() { - CLScheduler::get().enqueue(_border_handler, false); + CLScheduler::get().enqueue(*_border_handler, false); MemoryGroupResourceScope scope_mg(_memory_group); - CLScheduler::get().enqueue(_sobel_hor, false); - CLScheduler::get().enqueue(_sobel_vert); + CLScheduler::get().enqueue(*_sobel_hor, false); + CLScheduler::get().enqueue(*_sobel_vert); } diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp index 759c8706a1..4caf91488e 100644 --- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp +++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp @@ -24,24 +24,38 @@ #include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/ICLKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLSoftmaxLayerKernel.h" #include "src/core/helpers/SoftmaxHelpers.h" +#include "support/MemorySupport.h" namespace arm_compute { template <bool IS_LOG> CLSoftmaxLayerGeneric<IS_LOG>::CLSoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _permute_input(), _permute_output(), _max_shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp(), _input_permuted(), _output_permuted(), + : _memory_group(std::move(memory_manager)), + _permute_input(), + _permute_output(), + _max_shift_exp_sum_kernel(support::cpp14::make_unique<CLLogits1DMaxShiftExpSumKernel>()), + _norm_kernel(support::cpp14::make_unique<CLLogits1DNormKernel>()), + _max(), + _sum(), + _tmp(), + _input_permuted(), + _output_permuted(), _needs_permute() { } +template <bool IS_LOG> +CLSoftmaxLayerGeneric<IS_LOG>::~CLSoftmaxLayerGeneric() = default; + template <bool IS_LOG> void CLSoftmaxLayerGeneric<IS_LOG>::configure(const ICLTensor *input, ICLTensor *output, float beta, int32_t axis) { @@ -78,7 +92,7 @@ void CLSoftmaxLayerGeneric<IS_LOG>::configure(const CLCompileContext &compile_co _sum.allocator()->init(tmp_input->info()->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type)); // Set GPU target to kernels -
_max_shift_exp_sum_kernel.set_target(CLScheduler::get().target()); + _max_shift_exp_sum_kernel->set_target(CLScheduler::get().target()); // Manage intermediate buffers _memory_group.manage(&_tmp); @@ -91,8 +105,8 @@ void CLSoftmaxLayerGeneric<IS_LOG>::configure(const CLCompileContext &compile_co softmax_info.input_data_type = tmp_input->info()->data_type(); // Configure kernels - _max_shift_exp_sum_kernel.configure(compile_context, tmp_input, &_max, &_tmp, &_sum, softmax_info); - _norm_kernel.configure(compile_context, &_tmp, &_sum, tmp_output, softmax_info); + _max_shift_exp_sum_kernel->configure(compile_context, tmp_input, &_max, &_tmp, &_sum, softmax_info); + _norm_kernel->configure(compile_context, &_tmp, &_sum, tmp_output, softmax_info); // Allocate intermediate buffers _tmp.allocator()->allocate(); @@ -156,8 +170,8 @@ void CLSoftmaxLayerGeneric<IS_LOG>::run() _permute_input.run(); } - CLScheduler::get().enqueue(_max_shift_exp_sum_kernel, false); - CLScheduler::get().enqueue(_norm_kernel, !_needs_permute); + CLScheduler::get().enqueue(*_max_shift_exp_sum_kernel, false); + CLScheduler::get().enqueue(*_norm_kernel, !_needs_permute); if(_needs_permute) { diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp index eea3cb535f..e83def5677 100644 --- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp +++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp @@ -29,14 +29,21 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLMemsetKernel.h" +#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { CLSpaceToBatchLayer::CLSpaceToBatchLayer() - : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false) + : _space_to_batch_kernel(support::cpp14::make_unique<CLSpaceToBatchLayerKernel>()), + _memset_kernel(support::cpp14::make_unique<CLMemsetKernel>()), + _has_padding(false) { } +CLSpaceToBatchLayer::~CLSpaceToBatchLayer() = default; + void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output); @@ -49,9 +56,9 @@ void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, con if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _memset_kernel.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + _memset_kernel->configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); } - _space_to_batch_kernel.configure(compile_context, input, block_shape, paddings, output); + _space_to_batch_kernel->configure(compile_context, input, block_shape, paddings, output); } void CLSpaceToBatchLayer::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output) @@ -67,9 +74,9 @@ void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, con if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _memset_kernel.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + _memset_kernel->configure(compile_context, output, PixelValue(0,
input->info()->data_type(), input->info()->quantization_info())); } - _space_to_batch_kernel.configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, output); + _space_to_batch_kernel->configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, output); } Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output) @@ -94,8 +101,8 @@ void CLSpaceToBatchLayer::run() { // Zero out output only if we have paddings if(_has_padding) { - CLScheduler::get().enqueue(_memset_kernel, true); + CLScheduler::get().enqueue(*_memset_kernel, true); } - CLScheduler::get().enqueue(_space_to_batch_kernel, true); + CLScheduler::get().enqueue(*_space_to_batch_kernel, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp index 06aa92d6fa..db8c4953cc 100644 --- a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp +++ b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp @@ -29,14 +29,18 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { CLSpaceToDepthLayer::CLSpaceToDepthLayer() - : _space_to_depth_kernel() + : _space_to_depth_kernel(support::cpp14::make_unique<CLSpaceToDepthLayerKernel>()) { } +CLSpaceToDepthLayer::~CLSpaceToDepthLayer() = default; + void CLSpaceToDepthLayer::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape) { configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); @@ -44,7 +48,7 @@ void CLSpaceToDepthLayer::configure(const ICLTensor *input, ICLTensor *output, i void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) { - _space_to_depth_kernel.configure(compile_context, input, output, block_shape); + _space_to_depth_kernel->configure(compile_context, input, output, block_shape); } Status CLSpaceToDepthLayer::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) @@ -54,6 +58,6 @@ void CLSpaceToDepthLayer::run() { - CLScheduler::get().enqueue(_space_to_depth_kernel, true); + CLScheduler::get().enqueue(*_space_to_depth_kernel, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp index 39f0ab4779..f4aa78a72d 100644 --- a/src/runtime/CL/functions/CLStackLayer.cpp +++ b/src/runtime/CL/functions/CLStackLayer.cpp @@ -32,6 +32,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLStackLayerKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { @@ -42,6 +44,8 @@ CLStackLayer::CLStackLayer() // NOLINT { } +CLStackLayer::~CLStackLayer() = default; + void CLStackLayer::configure(const std::vector<ICLTensor *> &input, int axis, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, axis, output); @@ -50,14 +54,15 @@ void CLStackLayer::configure(const CLCompileContext &compile_context, const std::vector<ICLTensor *> &input, int axis, ICLTensor *output) { _num_inputs =
input.size(); - _stack_kernels.resize(_num_inputs); + _stack_kernels.reserve(_num_inputs); // Wrap around negative values const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1)); for(unsigned int i = 0; i < _num_inputs; i++) { - _stack_kernels[i].configure(compile_context, input[i], axis_u, i, _num_inputs, output); + _stack_kernels.emplace_back(support::cpp14::make_unique<CLStackLayerKernel>()); + _stack_kernels.back()->configure(compile_context, input[i], axis_u, i, _num_inputs, output); } } @@ -87,7 +92,7 @@ void CLStackLayer::run() { for(unsigned i = 0; i < _num_inputs; i++) { - CLScheduler::get().enqueue(_stack_kernels[i], false); + CLScheduler::get().enqueue(*_stack_kernels[i], false); } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLStridedSlice.cpp b/src/runtime/CL/functions/CLStridedSlice.cpp index b78073dd67..3f6814f5ce 100644 --- a/src/runtime/CL/functions/CLStridedSlice.cpp +++ b/src/runtime/CL/functions/CLStridedSlice.cpp @@ -24,8 +24,8 @@ #include "arm_compute/runtime/CL/functions/CLStridedSlice.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLStridedSliceKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLTableLookup.cpp b/src/runtime/CL/functions/CLTableLookup.cpp index 3d2d1853ca..8282f37e4b 100644 --- a/src/runtime/CL/functions/CLTableLookup.cpp +++ b/src/runtime/CL/functions/CLTableLookup.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLTableLookup.h" -#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h" +#include "src/core/CL/kernels/CLTableLookupKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLThreshold.cpp b/src/runtime/CL/functions/CLThreshold.cpp index bdbf37e841..250f6f034f 100644 --- a/src/runtime/CL/functions/CLThreshold.cpp +++ b/src/runtime/CL/functions/CLThreshold.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLThreshold.h" -#include "arm_compute/core/CL/kernels/CLThresholdKernel.h" +#include "src/core/CL/kernels/CLThresholdKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLTile.cpp b/src/runtime/CL/functions/CLTile.cpp index 68efad0125..8384e48baf 100644 --- a/src/runtime/CL/functions/CLTile.cpp +++ b/src/runtime/CL/functions/CLTile.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLTile.h" -#include "arm_compute/core/CL/kernels/CLTileKernel.h" +#include "src/core/CL/kernels/CLTileKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp index 8cade66a90..43fa7a012a 100644 --- a/src/runtime/CL/functions/CLTranspose.cpp +++ b/src/runtime/CL/functions/CLTranspose.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLTranspose.h" -#include "arm_compute/core/CL/kernels/CLTransposeKernel.h" +#include "src/core/CL/kernels/CLTransposeKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLUpsampleLayer.cpp b/src/runtime/CL/functions/CLUpsampleLayer.cpp index e9456c100b..10b4b76a5e 100644 --- a/src/runtime/CL/functions/CLUpsampleLayer.cpp +++ b/src/runtime/CL/functions/CLUpsampleLayer.cpp @@ -26,15 +26,19 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Utils.h" #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLUpsampleLayerKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { CLUpsampleLayer::CLUpsampleLayer() // NOLINT - : _upsample(), + : _upsample(support::cpp14::make_unique()), _output(nullptr) { } +CLUpsampleLayer::~CLUpsampleLayer() = default; + Status CLUpsampleLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &info, const InterpolationPolicy upsampling_policy) { @@ -53,11 +57,11 @@ void CLUpsampleLayer::configure(const CLCompileContext &compile_context, ICLTens ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); _output = output; - _upsample.configure(compile_context, input, _output, info, upsampling_policy); + _upsample->configure(compile_context, input, _output, info, upsampling_policy); } void CLUpsampleLayer::run() { - CLScheduler::get().enqueue(_upsample, false); + CLScheduler::get().enqueue(*_upsample, false); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLWarpAffine.cpp b/src/runtime/CL/functions/CLWarpAffine.cpp index fffc58c8d0..86e5a7bd86 100644 --- a/src/runtime/CL/functions/CLWarpAffine.cpp +++ b/src/runtime/CL/functions/CLWarpAffine.cpp @@ -23,8 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLWarpAffine.h" -#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLWarpAffineKernel.h" #include "support/MemorySupport.h" #include @@ -42,5 +43,5 @@ void CLWarpAffine::configure(const CLCompileContext &compile_context, ICLTensor auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, output, matrix, policy); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLWarpPerspective.cpp b/src/runtime/CL/functions/CLWarpPerspective.cpp index 2b4b187e38..7e8bc5cdff 100644 --- a/src/runtime/CL/functions/CLWarpPerspective.cpp +++ b/src/runtime/CL/functions/CLWarpPerspective.cpp @@ -23,8 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLWarpPerspective.h" -#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLWarpPerspectiveKernel.h" #include "support/MemorySupport.h" #include @@ -42,5 +43,5 @@ void CLWarpPerspective::configure(const CLCompileContext &compile_context, ICLTe auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, output, matrix, policy); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp index 7ad017f918..7af42904e8 100644 --- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp @@ -28,6 +28,15 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include 
"src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "src/core/CL/kernels/CLWinogradFilterTransformKernel.h" +#include "src/core/CL/kernels/CLWinogradOutputTransformKernel.h" +#include "support/MemorySupport.h" using namespace arm_compute; @@ -90,11 +99,13 @@ bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_siz } // namespace CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr memory_manager) - : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _input0(), _input1(), _batched_mm_output(), _original_weights(nullptr), - _is_prepared(false) + : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(support::cpp14::make_unique()), + _output_transform(support::cpp14::make_unique()), _input0(), _input1(), _batched_mm_output(), _original_weights(nullptr), _is_prepared(false) { } +CLWinogradConvolutionLayer::~CLWinogradConvolutionLayer() = default; + void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math) { @@ -139,7 +150,7 @@ void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_conte _input_transform.configure(compile_context, input, &_input0, winograd_info); // Configure filter transform - _filter_transform.configure(compile_context, weights, &_input1, winograd_info); + _filter_transform->configure(compile_context, weights, &_input1, winograd_info); // Configure batched matrix multiply _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, @@ -147,7 +158,7 @@ void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_conte (input->info()->data_type() == DataType::F16))); // Configure output transform - _output_transform.configure(compile_context, &_batched_mm_output, biases, output, winograd_info, act_info); + _output_transform->configure(compile_context, &_batched_mm_output, biases, output, winograd_info, act_info); // Allocate temporary tensors _input0.allocator()->allocate(); @@ -218,7 +229,7 @@ void CLWinogradConvolutionLayer::run() _batched_mm.run(); // Run output transform - CLScheduler::get().enqueue(_output_transform); + CLScheduler::get().enqueue(*_output_transform); } void CLWinogradConvolutionLayer::prepare() @@ -227,7 +238,7 @@ void CLWinogradConvolutionLayer::prepare() { // Run filter transform and mark original weights as unused _input1.allocator()->allocate(); - CLScheduler::get().enqueue(_filter_transform, false); + CLScheduler::get().enqueue(*_filter_transform, false); _original_weights->mark_as_unused(); // Prepare GEMM and release reshaped weights if marked unused by CLGEMM diff --git a/src/runtime/CL/functions/CLWinogradInputTransform.cpp b/src/runtime/CL/functions/CLWinogradInputTransform.cpp index 9498206549..308c41f714 100644 --- a/src/runtime/CL/functions/CLWinogradInputTransform.cpp +++ b/src/runtime/CL/functions/CLWinogradInputTransform.cpp @@ -24,8 
diff --git a/src/runtime/CL/functions/CLWinogradInputTransform.cpp b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
index 9498206549..308c41f714 100644
--- a/src/runtime/CL/functions/CLWinogradInputTransform.cpp
+++ b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
@@ -24,8 +24,9 @@
 #include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h"
 #include "arm_compute/core/Error.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLWinogradInputTransformKernel.h"
 #include "support/MemorySupport.h"
 
 using namespace arm_compute;
@@ -40,7 +41,7 @@ void CLWinogradInputTransform::configure(const CLCompileContext &compile_context
     auto k = arm_compute::support::cpp14::make_unique<CLWinogradInputTransformKernel>();
     k->configure(compile_context, input, output, winograd_info);
     _kernel = std::move(k);
-    _border_handler.configure(compile_context, input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
+    _border_handler->configure(compile_context, input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
 }
 
 Status CLWinogradInputTransform::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
diff --git a/src/runtime/CL/functions/CLYOLOLayer.cpp b/src/runtime/CL/functions/CLYOLOLayer.cpp
index d553f97009..46bf220b0c 100644
--- a/src/runtime/CL/functions/CLYOLOLayer.cpp
+++ b/src/runtime/CL/functions/CLYOLOLayer.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/CL/functions/CLYOLOLayer.h"
 
-#include "arm_compute/core/CL/kernels/CLYOLOLayerKernel.h"
 #include "arm_compute/core/Types.h"
+#include "src/core/CL/kernels/CLYOLOLayerKernel.h"
 #include "support/MemorySupport.h"
 
 using namespace arm_compute;
-- 
cgit v1.2.1