From 2b84be544e4a27f7e8e80827e9c85c8f0d58b4ce Mon Sep 17 00:00:00 2001
From: Manuel Bottini
Date: Wed, 8 Apr 2020 10:15:51 +0100
Subject: COMPMID-3280: Make all ML primitives for CL use the new interface -
 Part 2

- CLFunctions have been updated

Change-Id: Ie3256a6c775bc12f3126482bd8e8a46da54b267c
Signed-off-by: Manuel Bottini
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3053
Reviewed-by: Georgios Pinitas
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
---
 src/runtime/CL/functions/CLAbsoluteDifference.cpp  |   7 +-
 src/runtime/CL/functions/CLAccumulate.cpp          |  21 +-
 src/runtime/CL/functions/CLActivationLayer.cpp     |   7 +-
 src/runtime/CL/functions/CLArgMinMaxLayer.cpp      |  17 +-
 .../CL/functions/CLBatchNormalizationLayer.cpp     |  11 +-
 src/runtime/CL/functions/CLBatchToSpaceLayer.cpp   |  16 +-
 src/runtime/CL/functions/CLBitwiseAnd.cpp          |   7 +-
 src/runtime/CL/functions/CLBitwiseNot.cpp          |   7 +-
 src/runtime/CL/functions/CLBitwiseOr.cpp           |   7 +-
 src/runtime/CL/functions/CLBitwiseXor.cpp          |   7 +-
 .../CL/functions/CLBoundingBoxTransform.cpp        |   7 +-
 src/runtime/CL/functions/CLBox3x3.cpp              |   9 +-
 src/runtime/CL/functions/CLCannyEdge.cpp           |  21 +-
 src/runtime/CL/functions/CLCast.cpp                |   7 +-
 src/runtime/CL/functions/CLChannelCombine.cpp      |  14 +-
 src/runtime/CL/functions/CLChannelExtract.cpp      |  14 +-
 src/runtime/CL/functions/CLChannelShuffleLayer.cpp |   7 +-
 src/runtime/CL/functions/CLColorConvert.cpp        |  28 +-
 src/runtime/CL/functions/CLComparison.cpp          |  19 +-
 src/runtime/CL/functions/CLComputeAllAnchors.cpp   |   7 +-
 src/runtime/CL/functions/CLConcatenateLayer.cpp    |  28 +-
 .../functions/CLConvertFullyConnectedWeights.cpp   |   8 +-
 src/runtime/CL/functions/CLConvolution.cpp         |  37 ++-
 src/runtime/CL/functions/CLConvolutionLayer.cpp    |  17 +-
 src/runtime/CL/functions/CLCopy.cpp                |   7 +-
 src/runtime/CL/functions/CLCropResize.cpp          | 282 +++++++++++----------
 src/runtime/CL/functions/CLDeconvolutionLayer.cpp  |  12 +-
 .../CL/functions/CLDeconvolutionLayerUpsample.cpp  |  11 +-
 src/runtime/CL/functions/CLDepthConvertLayer.cpp   |   7 +-
 src/runtime/CL/functions/CLDepthToSpaceLayer.cpp   |   7 +-
 .../CL/functions/CLDepthwiseConvolutionLayer.cpp   |  55 +++-
 src/runtime/CL/functions/CLDequantizationLayer.cpp |   7 +-
 src/runtime/CL/functions/CLDerivative.cpp          |   9 +-
 src/runtime/CL/functions/CLDilate.cpp              |   9 +-
 .../CL/functions/CLDirectConvolutionLayer.cpp      |  13 +-
 .../CL/functions/CLDirectDeconvolutionLayer.cpp    |  30 ++-
 .../CL/functions/CLElementWiseUnaryLayer.cpp       |  49 +++-
 .../CL/functions/CLElementwiseOperations.cpp       |  67 +++--
 src/runtime/CL/functions/CLEqualizeHistogram.cpp   |  13 +-
 src/runtime/CL/functions/CLErode.cpp               |   9 +-
 src/runtime/CL/functions/CLFFT1D.cpp               |  13 +-
 src/runtime/CL/functions/CLFFT2D.cpp               |   9 +-
 src/runtime/CL/functions/CLFFTConvolutionLayer.cpp |  38 +--
 src/runtime/CL/functions/CLFastCorners.cpp         |  16 +-
 src/runtime/CL/functions/CLFill.cpp                |  10 +-
 src/runtime/CL/functions/CLFillBorder.cpp          |   7 +-
 src/runtime/CL/functions/CLFlattenLayer.cpp        |   7 +-
 src/runtime/CL/functions/CLFloor.cpp               |   2 +-
 src/runtime/CL/functions/CLFullyConnectedLayer.cpp |  44 ++--
 .../CL/functions/CLFuseBatchNormalization.cpp      |  12 +-
 src/runtime/CL/functions/CLGEMM.cpp                |  49 ++--
 .../CL/functions/CLGEMMConvolutionLayer.cpp        |  37 ++-
 .../CL/functions/CLGEMMDeconvolutionLayer.cpp      |  70 ++---
 .../CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp  |  30 ++-
 src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp |  77 +++++-
 src/runtime/CL/functions/CLGather.cpp              |   7 +-
 src/runtime/CL/functions/CLGaussian3x3.cpp         |   9 +-
 src/runtime/CL/functions/CLGaussian5x5.cpp         |  13 +-
 src/runtime/CL/functions/CLGaussianPyramid.cpp     |  22 +-
 .../CL/functions/CLGenerateProposalsLayer.cpp      |  31 ++-
 src/runtime/CL/functions/CLHOGDescriptor.cpp       |  13 +-
 src/runtime/CL/functions/CLHOGDetector.cpp         |  10 +-
 src/runtime/CL/functions/CLHOGGradient.cpp         |  14 +-
 src/runtime/CL/functions/CLHOGMultiDetection.cpp   |  16 +-
 src/runtime/CL/functions/CLHarrisCorners.cpp       |  21 +-
 src/runtime/CL/functions/CLHistogram.cpp           |  11 +-
 .../CL/functions/CLInstanceNormalizationLayer.cpp  |   7 +-
 src/runtime/CL/functions/CLIntegralImage.cpp       |  11 +-
 src/runtime/CL/functions/CLL2NormalizeLayer.cpp    |   9 +-
 src/runtime/CL/functions/CLLSTMLayer.cpp           | 109 ++++----
 src/runtime/CL/functions/CLLSTMLayerQuantized.cpp  |  68 +++--
 src/runtime/CL/functions/CLLaplacianPyramid.cpp    |  13 +-
 .../CL/functions/CLLaplacianReconstruct.cpp        |  13 +-
 .../CL/functions/CLLocallyConnectedLayer.cpp       |  16 +-
 src/runtime/CL/functions/CLMagnitude.cpp           |   7 +-
 src/runtime/CL/functions/CLMeanStdDev.cpp          |  15 +-
 .../functions/CLMeanStdDevNormalizationLayer.cpp   |   7 +-
 src/runtime/CL/functions/CLMedian3x3.cpp           |   9 +-
 src/runtime/CL/functions/CLMinMaxLocation.cpp      |  13 +-
 src/runtime/CL/functions/CLNonLinearFilter.cpp     |  10 +-
 .../CL/functions/CLNonMaximaSuppression3x3.cpp     |  11 +-
 src/runtime/CL/functions/CLNormalizationLayer.cpp  |  11 +-
 .../CL/functions/CLNormalizePlanarYUVLayer.cpp     |   7 +-
 src/runtime/CL/functions/CLOpticalFlow.cpp         |  19 +-
 src/runtime/CL/functions/CLPReluLayer.cpp          |  13 +-
 src/runtime/CL/functions/CLPadLayer.cpp            |  11 +-
 src/runtime/CL/functions/CLPermute.cpp             |   7 +-
 src/runtime/CL/functions/CLPhase.cpp               |   7 +-
 .../CL/functions/CLPixelWiseMultiplication.cpp     |  19 +-
 src/runtime/CL/functions/CLPoolingLayer.cpp        |   9 +-
 src/runtime/CL/functions/CLPriorBoxLayer.cpp       |   9 +-
 src/runtime/CL/functions/CLQLSTMLayer.cpp          | 132 ++++++----
 src/runtime/CL/functions/CLQuantizationLayer.cpp   |   7 +-
 src/runtime/CL/functions/CLRNNLayer.cpp            |  17 +-
 src/runtime/CL/functions/CLROIAlignLayer.cpp       |   7 +-
 src/runtime/CL/functions/CLROIPoolingLayer.cpp     |   7 +-
 src/runtime/CL/functions/CLRange.cpp               |   7 +-
 src/runtime/CL/functions/CLReduceMean.cpp          |  11 +-
 src/runtime/CL/functions/CLReductionOperation.cpp  |  21 +-
 src/runtime/CL/functions/CLRemap.cpp               |  11 +-
 src/runtime/CL/functions/CLReorgLayer.cpp          |   7 +-
 src/runtime/CL/functions/CLReshapeLayer.cpp        |   7 +-
 src/runtime/CL/functions/CLReverse.cpp             |   7 +-
 src/runtime/CL/functions/CLScale.cpp               |  10 +-
 src/runtime/CL/functions/CLScharr3x3.cpp           |   9 +-
 src/runtime/CL/functions/CLSelect.cpp              |   9 +-
 src/runtime/CL/functions/CLSlice.cpp               |   7 +-
 src/runtime/CL/functions/CLSobel3x3.cpp            |   9 +-
 src/runtime/CL/functions/CLSobel5x5.cpp            |  21 +-
 src/runtime/CL/functions/CLSobel7x7.cpp            |  21 +-
 src/runtime/CL/functions/CLSoftmaxLayer.cpp        |  28 +-
 src/runtime/CL/functions/CLSpaceToBatchLayer.cpp   |  21 +-
 src/runtime/CL/functions/CLSpaceToDepthLayer.cpp   |   9 +-
 src/runtime/CL/functions/CLStackLayer.cpp          |   7 +-
 src/runtime/CL/functions/CLStridedSlice.cpp        |   9 +-
 src/runtime/CL/functions/CLTableLookup.cpp         |   7 +-
 src/runtime/CL/functions/CLThreshold.cpp           |   8 +-
 src/runtime/CL/functions/CLTile.cpp                |   7 +-
 src/runtime/CL/functions/CLTranspose.cpp           |   7 +-
 src/runtime/CL/functions/CLUnstack.cpp             |   9 +-
 src/runtime/CL/functions/CLUpsampleLayer.cpp       |  10 +-
 src/runtime/CL/functions/CLWarpAffine.cpp          |  10 +-
 src/runtime/CL/functions/CLWarpPerspective.cpp     |  10 +-
 .../CL/functions/CLWinogradConvolutionLayer.cpp    |  20 +-
 .../CL/functions/CLWinogradInputTransform.cpp      |   9 +-
 src/runtime/CL/functions/CLYOLOLayer.cpp           |   7 +-
 126 files changed, 1698 insertions(+), 742 deletions(-)
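Note on the pattern: every diff below makes the same mechanical change. Each public
configure() method gains an overload taking a CLCompileContext as its first parameter,
and the legacy signature becomes a thin forwarder that supplies the library's default
context. A minimal sketch of that shape, using the hypothetical placeholder names
CLExampleFunction and CLExampleKernel (they are not classes in the library; the real
functions in this patch follow exactly this structure):

    // Legacy overload: kept for source compatibility; forwards to the new
    // overload using the library's global default compile context.
    void CLExampleFunction::configure(const ICLTensor *input, ICLTensor *output)
    {
        configure(CLKernelLibrary::get().get_compile_context(), input, output);
    }

    // New overload: the caller-supplied compile context is threaded down to
    // every kernel (and border handler) that this function configures, so the
    // OpenCL programs are built against that context instead of the global one.
    void CLExampleFunction::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
    {
        auto k = arm_compute::support::cpp14::make_unique<CLExampleKernel>();
        k->configure(compile_context, input, output);
        _kernel = std::move(k);
    }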
(limited to 'src/runtime/CL')

diff --git a/src/runtime/CL/functions/CLAbsoluteDifference.cpp b/src/runtime/CL/functions/CLAbsoluteDifference.cpp
index cf77c807cf..492c54e4d3 100644
--- a/src/runtime/CL/functions/CLAbsoluteDifference.cpp
+++ b/src/runtime/CL/functions/CLAbsoluteDifference.cpp
@@ -31,8 +31,13 @@
 using namespace arm_compute;
 
 void CLAbsoluteDifference::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
+}
+
+void CLAbsoluteDifference::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLAbsoluteDifferenceKernel>();
-    k->configure(input1, input2, output);
+    k->configure(compile_context, input1, input2, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLAccumulate.cpp b/src/runtime/CL/functions/CLAccumulate.cpp
index d529d9402a..a81d1d042b 100644
--- a/src/runtime/CL/functions/CLAccumulate.cpp
+++ b/src/runtime/CL/functions/CLAccumulate.cpp
@@ -31,22 +31,37 @@
 using namespace arm_compute;
 
 void CLAccumulate::configure(const ICLTensor *input, ICLTensor *accum)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, accum);
+}
+
+void CLAccumulate::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *accum)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLAccumulateKernel>();
-    k->configure(input, accum);
+    k->configure(compile_context, input, accum);
     _kernel = std::move(k);
 }
 
 void CLAccumulateWeighted::configure(const ICLTensor *input, float alpha, ICLTensor *accum)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, alpha, accum);
+}
+
+void CLAccumulateWeighted::configure(const CLCompileContext &compile_context, const ICLTensor *input, float alpha, ICLTensor *accum)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLAccumulateWeightedKernel>();
-    k->configure(input, alpha, accum);
+    k->configure(compile_context, input, alpha, accum);
     _kernel = std::move(k);
 }
 
 void CLAccumulateSquared::configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, shift, accum);
+}
+
+void CLAccumulateSquared::configure(const CLCompileContext &compile_context, const ICLTensor *input, uint32_t shift, ICLTensor *accum)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLAccumulateSquaredKernel>();
-    k->configure(input, shift, accum);
+    k->configure(compile_context, input, shift, accum);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
index 9882145741..989603a9df 100644
--- a/src/runtime/CL/functions/CLActivationLayer.cpp
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp
@@ -36,9 +36,14 @@ CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx)
 }
 
 void CLActivationLayer::configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, act_info);
+}
+
+void CLActivationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLActivationLayerKernel>();
-    k->configure(input, output, act_info);
+    k->configure(compile_context, input, output, act_info);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
index 4ac6d25d75..5b4c694f33 100644
--- a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
+++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -105,6 +105,11 @@ Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITen
 }
 
 void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, axis, output, op);
+}
+
+void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     _num_of_stages  = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
@@ -121,7 +126,7 @@ void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *ou
     // Create temporary tensors
     if(_num_of_stages == 1)
     {
-        _reduction_kernels_vector[0].configure(input, nullptr, &_not_reshaped_output, axis, op);
+        _reduction_kernels_vector[0].configure(compile_context, input, nullptr, &_not_reshaped_output, axis, op);
     }
     else
     {
@@ -135,22 +140,22 @@ void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *ou
 
         // Apply ReductionOperation only on first kernel
         _memory_group.manage(&_results_vector[0]);
-        _reduction_kernels_vector[0].configure(input, nullptr, &_results_vector[0], axis, op);
+        _reduction_kernels_vector[0].configure(compile_context, input, nullptr, &_results_vector[0], axis, op);
 
         // Apply ReductionOperation on intermediate stages
         for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
         {
             _memory_group.manage(&_results_vector[i]);
-            _reduction_kernels_vector[i].configure(input, &_results_vector[i - 1], &_results_vector[i], axis, op);
+            _reduction_kernels_vector[i].configure(compile_context, input, &_results_vector[i - 1], &_results_vector[i], axis, op);
             _results_vector[i - 1].allocator()->allocate();
         }
 
         // Apply ReductionOperation on the last stage
         const unsigned int last_stage = _num_of_stages - 1;
-        _reduction_kernels_vector[last_stage].configure(input, &_results_vector[last_stage - 1], &_not_reshaped_output, axis, op);
+        _reduction_kernels_vector[last_stage].configure(compile_context, input, &_results_vector[last_stage - 1], &_not_reshaped_output, axis, op);
         _results_vector[last_stage - 1].allocator()->allocate();
     }
-    _reshape_kernel.configure(&_not_reshaped_output, output);
+    _reshape_kernel.configure(compile_context, &_not_reshaped_output, output);
     _not_reshaped_output.allocator()->allocate();
 }
diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
index f87ea6ea06..9fc51136b3 100644
--- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,7 +40,14 @@ CLBatchNormalizationLayer::CLBatchNormalizationLayer()
 void CLBatchNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon,
                                           ActivationLayerInfo act_info)
 {
-    _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon, act_info);
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info);
+}
+
+void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta,
+                                          const ICLTensor *gamma, float epsilon,
+                                          ActivationLayerInfo act_info)
+{
+    _norm_kernel.configure(compile_context, input, output, mean, var, beta, gamma, epsilon, act_info);
 }
 
 Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
diff --git a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
index 7919b131c8..0a2ae2a6e0 100644
--- a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,12 +39,22 @@ CLBatchToSpaceLayer::CLBatchToSpaceLayer()
 
 void CLBatchToSpaceLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
 {
-    _batch_to_space_kernel.configure(input, block_shape, output);
+    configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output);
+}
+
+void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
+{
+    _batch_to_space_kernel.configure(compile_context, input, block_shape, output);
 }
 
 void CLBatchToSpaceLayer::configure(const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output)
 {
-    _batch_to_space_kernel.configure(input, block_shape_x, block_shape_y, output);
+    configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output);
+}
+
+void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output)
+{
+    _batch_to_space_kernel.configure(compile_context, input, block_shape_x, block_shape_y, output);
 }
 
 Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp
index 55ee78cd28..1fa80f0a24 100644
--- a/src/runtime/CL/functions/CLBitwiseAnd.cpp
+++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp
@@ -31,8 +31,13 @@
 using namespace arm_compute;
 
 void CLBitwiseAnd::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
+}
+
+void CLBitwiseAnd::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLBitwiseAndKernel>();
-    k->configure(input1, input2, output);
+    k->configure(compile_context, input1, input2, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp
index f20363169b..46595191a0 100644
--- a/src/runtime/CL/functions/CLBitwiseNot.cpp
+++ b/src/runtime/CL/functions/CLBitwiseNot.cpp
@@ -31,8 +31,13 @@
 using namespace arm_compute;
 
 void CLBitwiseNot::configure(const ICLTensor *input, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLBitwiseNot::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLBitwiseNotKernel>();
-    k->configure(input, output);
+    k->configure(compile_context, input, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp
index f1d07df1e7..8431140cb8 100644
--- a/src/runtime/CL/functions/CLBitwiseOr.cpp
+++ b/src/runtime/CL/functions/CLBitwiseOr.cpp
@@ -31,8 +31,13 @@
 using namespace arm_compute;
 
 void CLBitwiseOr::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
+}
+
+void CLBitwiseOr::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLBitwiseOrKernel>();
-    k->configure(input1, input2, output);
+    k->configure(compile_context, input1, input2, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp
index e5d3c7a292..0e0e7f2028 100644
--- a/src/runtime/CL/functions/CLBitwiseXor.cpp
+++ b/src/runtime/CL/functions/CLBitwiseXor.cpp
@@ -31,8 +31,13 @@
 using namespace arm_compute;
 
 void CLBitwiseXor::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
+}
+
+void CLBitwiseXor::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLBitwiseXorKernel>();
-    k->configure(input1, input2, output);
+    k->configure(compile_context, input1, input2, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
index 8669da636e..55bcde749c 100644
--- a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
+++ b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
@@ -29,10 +29,15 @@
 namespace arm_compute
 {
 void CLBoundingBoxTransform::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), boxes, pred_boxes, deltas, info);
+}
+
+void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
 {
     // Configure Bounding Box kernel
     auto k = arm_compute::support::cpp14::make_unique<CLBoundingBoxTransformKernel>();
-    k->configure(boxes, pred_boxes, deltas, info);
+    k->configure(compile_context, boxes, pred_boxes, deltas, info);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLBox3x3.cpp b/src/runtime/CL/functions/CLBox3x3.cpp
index d288e42d9b..72c822197c 100644
--- a/src/runtime/CL/functions/CLBox3x3.cpp
+++ b/src/runtime/CL/functions/CLBox3x3.cpp
@@ -32,9 +32,14 @@
 using namespace arm_compute;
 
 void CLBox3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
+}
+
+void CLBox3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLBox3x3Kernel>();
-    k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+    k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
+    _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value));
 }
diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp
index 28abaa284b..0c8d3532aa 100644
--- a/src/runtime/CL/functions/CLCannyEdge.cpp
+++ b/src/runtime/CL/functions/CLCannyEdge.cpp
@@ -57,6 +57,13 @@ CLCannyEdge::CLCannyEdge(std::shared_ptr<IMemoryManager> memory_manager) // NOLI
 
 void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode,
                             uint8_t constant_border_value)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, upper_thr, lower_thr, gradient_size, norm_type, border_mode, constant_border_value);
+}
+
+void CLCannyEdge::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type,
+                            BorderMode border_mode,
+                            uint8_t constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
@@ -111,19 +118,19 @@ void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_t
     if(gradient_size == 3)
     {
         auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3>();
-        k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+        k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
         _sobel = std::move(k);
     }
     else if(gradient_size == 5)
     {
         auto k = arm_compute::support::cpp14::make_unique<CLSobel5x5>();
-        k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+        k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
         _sobel = std::move(k);
     }
     else if(gradient_size == 7)
    {
         auto k = arm_compute::support::cpp14::make_unique<CLSobel7x7>();
-        k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+        k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
         _sobel = std::move(k);
     }
     else
@@ -136,7 +143,7 @@ void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_t
     _memory_group.manage(&_phase);
 
     // Configure gradient
-    _gradient.configure(&_gx, &_gy, &_mag, &_phase, norm_type);
+    _gradient.configure(compile_context, &_gx, &_gy, &_mag, &_phase, norm_type);
 
     // Allocate intermediate buffers
     _gx.allocator()->allocate();
@@ -146,14 +153,14 @@ void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_t
     _memory_group.manage(&_nonmax);
 
     // Configure non-maxima suppression
-    _non_max_suppr.configure(&_mag, &_phase, &_nonmax, lower_thr, border_mode == BorderMode::UNDEFINED);
+    _non_max_suppr.configure(compile_context, &_mag, &_phase, &_nonmax, lower_thr, border_mode == BorderMode::UNDEFINED);
 
     // Allocate intermediate buffers
     _phase.allocator()->allocate();
 
     // Fill border around magnitude image as non-maxima suppression will access
     // it. If border mode is undefined filling the border is a nop.
-    _border_mag_gradient.configure(&_mag, _non_max_suppr.border_size(), border_mode, constant_border_value);
+    _border_mag_gradient.configure(compile_context, &_mag, _non_max_suppr.border_size(), border_mode, constant_border_value);
 
     // Allocate intermediate buffers
     _mag.allocator()->allocate();
@@ -165,7 +172,7 @@ void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_t
     _memory_group.manage(&_l1_list_counter);
 
     // Configure edge tracing
-    _edge_trace.configure(&_nonmax, output, upper_thr, lower_thr, &_visited, &_recorded, &_l1_stack, &_l1_list_counter);
+    _edge_trace.configure(compile_context, &_nonmax, output, upper_thr, lower_thr, &_visited, &_recorded, &_l1_stack, &_l1_list_counter);
 
     // Allocate intermediate buffers
     _visited.allocator()->allocate();
diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp
index e3813b3008..7048a79bc5 100644
--- a/src/runtime/CL/functions/CLCast.cpp
+++ b/src/runtime/CL/functions/CLCast.cpp
@@ -31,9 +31,14 @@
 namespace arm_compute
 {
 void CLCast::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, policy);
+}
+
+void CLCast::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertLayerKernel>();
-    k->configure(input, output, policy, 0);
+    k->configure(compile_context, input, output, policy, 0);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLChannelCombine.cpp b/src/runtime/CL/functions/CLChannelCombine.cpp
index c7f2748b0b..249212e03b 100644
--- a/src/runtime/CL/functions/CLChannelCombine.cpp
+++ b/src/runtime/CL/functions/CLChannelCombine.cpp
@@ -31,15 +31,25 @@
 using namespace arm_compute;
 
 void CLChannelCombine::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), plane0, plane1, plane2, plane3, output);
+}
+
+void CLChannelCombine::configure(const CLCompileContext &compile_context, const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLChannelCombineKernel>();
-    k->configure(plane0, plane1, plane2, plane3, output);
+    k->configure(compile_context, plane0, plane1, plane2, plane3, output);
     _kernel = std::move(k);
 }
 
 void CLChannelCombine::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), plane0, plane1, plane2, output);
+}
+
+void CLChannelCombine::configure(const CLCompileContext &compile_context, const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLChannelCombineKernel>();
-    k->configure(plane0, plane1, plane2, output);
+    k->configure(compile_context, plane0, plane1, plane2, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLChannelExtract.cpp b/src/runtime/CL/functions/CLChannelExtract.cpp
index 810103994d..019e0a7a90 100644
--- a/src/runtime/CL/functions/CLChannelExtract.cpp
+++ b/src/runtime/CL/functions/CLChannelExtract.cpp
@@ -31,15 +31,25 @@
 using namespace arm_compute;
 
 void CLChannelExtract::configure(const ICLTensor *input, Channel channel, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, channel, output);
+}
+
+void CLChannelExtract::configure(const CLCompileContext &compile_context, const ICLTensor *input, Channel channel, ICLTensor *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLChannelExtractKernel>();
-    k->configure(input, channel, output);
+    k->configure(compile_context, input, channel, output);
     _kernel = std::move(k);
 }
 
 void CLChannelExtract::configure(const ICLMultiImage *input, Channel channel, ICLImage *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, channel, output);
+}
+
+void CLChannelExtract::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, Channel channel, ICLImage *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLChannelExtractKernel>();
-    k->configure(input, channel, output);
+    k->configure(compile_context, input, channel, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
index 2e1192a725..93ab7c7ddf 100644
--- a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
+++ b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
@@ -30,9 +30,14 @@
 namespace arm_compute
 {
 void CLChannelShuffleLayer::configure(const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups);
+}
+
+void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLChannelShuffleLayerKernel>();
-    k->configure(input, output, num_groups);
+    k->configure(compile_context, input, output, num_groups);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLColorConvert.cpp b/src/runtime/CL/functions/CLColorConvert.cpp
index 49aacea788..b8e597751b 100644
--- a/src/runtime/CL/functions/CLColorConvert.cpp
+++ b/src/runtime/CL/functions/CLColorConvert.cpp
@@ -31,29 +31,49 @@
 using namespace arm_compute;
 
 void CLColorConvert::configure(const ICLTensor *input, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>();
-    k->configure(input, output);
+    k->configure(compile_context, input, output);
     _kernel = std::move(k);
 }
 
 void CLColorConvert::configure(const ICLImage *input, ICLMultiImage *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLMultiImage *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>();
-    k->configure(input, output);
+    k->configure(compile_context, input, output);
     _kernel = std::move(k);
 }
 
 void CLColorConvert::configure(const ICLMultiImage *input, ICLImage *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLImage *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>();
-    k->configure(input, output);
+    k->configure(compile_context, input, output);
     _kernel = std::move(k);
 }
 
 void CLColorConvert::configure(const ICLMultiImage *input, ICLMultiImage *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLColorConvert::configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLMultiImage *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLColorConvertKernel>();
-    k->configure(input, output);
+    k->configure(compile_context, input, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLComparison.cpp b/src/runtime/CL/functions/CLComparison.cpp
index ac56d5782c..8d5ec3571d 100644
--- a/src/runtime/CL/functions/CLComparison.cpp
+++ b/src/runtime/CL/functions/CLComparison.cpp
@@ -31,9 +31,14 @@
 namespace arm_compute
 {
 void CLComparison::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation);
+}
+
+void CLComparison::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLComparisonKernel>();
-    k->configure(input1, input2, output, operation);
+    k->configure(compile_context, input1, input2, output, operation);
     _kernel = std::move(k);
 
     if(output->info()->dimension(0) > 1)
@@ -42,7 +47,7 @@ void CLComparison::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *ou
 
         if(broadcasted_info->info()->dimension(0) == 1)
         {
-            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+            _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
         }
     }
 }
@@ -54,9 +59,15 @@ Status CLComparison::validate(const ITensorInfo *input1, const ITensorInfo *inpu
 
 template <ComparisonOperation COP>
 void CLComparisonStatic<COP>::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
+}
+
+template <ComparisonOperation COP>
+void CLComparisonStatic<COP>::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLComparisonKernel>();
-    k->configure(input1, input2, output, COP);
+    k->configure(compile_context, input1, input2, output, COP);
     _kernel = std::move(k);
 
     if(output->info()->dimension(0) > 1)
@@ -65,7 +76,7 @@ void CLComparisonStatic<COP>::configure(ICLTensor *input1, ICLTensor *input2, IC
 
         if(broadcasted_info->info()->dimension(0) == 1)
         {
-            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+            _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
         }
     }
 }
diff --git a/src/runtime/CL/functions/CLComputeAllAnchors.cpp b/src/runtime/CL/functions/CLComputeAllAnchors.cpp
index b71e89c804..62714fed5c 100644
--- a/src/runtime/CL/functions/CLComputeAllAnchors.cpp
+++ b/src/runtime/CL/functions/CLComputeAllAnchors.cpp
@@ -28,10 +28,15 @@
 namespace arm_compute
 {
 void CLComputeAllAnchors::configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), anchors, all_anchors, info);
+}
+
+void CLComputeAllAnchors::configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info)
 {
     // Configure ComputeAllAnchors kernel
     auto k = arm_compute::support::cpp14::make_unique<CLComputeAllAnchorsKernel>();
-    k->configure(anchors, all_anchors, info);
+    k->configure(compile_context, anchors, all_anchors, info);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp
index c85450b2f6..e97256713f 100644
--- a/src/runtime/CL/functions/CLConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp
@@ -49,12 +49,22 @@ CLConcatenateLayer::CLConcatenateLayer()
 
 void CLConcatenateLayer::configure(std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
 {
-    configure_internal(std::move(inputs_vector), output, axis);
+    configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis);
+}
+
+void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
+{
+    configure_internal(compile_context, std::move(inputs_vector), output, axis);
 }
 
 void CLConcatenateLayer::configure(std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
 {
-    configure_internal(std::move(inputs_vector), output, axis);
+    configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis);
+}
+
+void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
+{
+    configure_internal(compile_context, std::move(inputs_vector), output, axis);
 }
 
 Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
@@ -68,7 +78,7 @@ Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inpu
 }
 
 template <typename TensorType>
-void CLConcatenateLayer::configure_internal(std::vector<TensorType *> &&inputs_vector, ICLTensor *output, size_t axis)
+void CLConcatenateLayer::configure_internal(const CLCompileContext &compile_context, std::vector<TensorType *> &&inputs_vector, ICLTensor *output, size_t axis)
 {
     ARM_COMPUTE_ERROR_ON(output == nullptr);
     _axis = axis;
@@ -97,7 +107,7 @@ void CLConcatenateLayer::configure_internal(std::vector<TensorType *> &&inputs_v
             {
                 // Configure WidthConcatenate2Tensors kernel
                 auto kernel = support::cpp14::make_unique<CLWidthConcatenate2TensorsKernel>();
-                kernel->configure(inputs_vector.at(0), inputs_vector.at(1), output);
+                kernel->configure(compile_context, inputs_vector.at(0), inputs_vector.at(1), output);
                 _concat_kernels.emplace_back(std::move(kernel));
                 break;
             }
@@ -105,7 +115,7 @@ void CLConcatenateLayer::configure_internal(std::vector<TensorType *> &&inputs_v
             {
                 // Configure WidthConcatenate4Tensors kernel
                 auto kernel = support::cpp14::make_unique<CLWidthConcatenate4TensorsKernel>();
-                kernel->configure(inputs_vector.at(0), inputs_vector.at(1), inputs_vector.at(2), inputs_vector.at(3), output);
+                kernel->configure(compile_context, inputs_vector.at(0), inputs_vector.at(1), inputs_vector.at(2), inputs_vector.at(3), output);
                 _concat_kernels.emplace_back(std::move(kernel));
                 break;
             }
@@ -115,7 +125,7 @@ void CLConcatenateLayer::configure_internal(std::vector<TensorType *> &&inputs_v
                 for(unsigned int i = 0; i < _num_inputs; ++i)
                 {
                     auto kernel = support::cpp14::make_unique<CLWidthConcatenateLayerKernel>();
-                    kernel->configure(inputs_vector.at(i), offset, output);
+                    kernel->configure(compile_context, inputs_vector.at(i), offset, output);
                     offset += inputs_vector.at(i)->info()->dimension(_axis);
                     _concat_kernels.emplace_back(std::move(kernel));
                 }
@@ -129,7 +139,7 @@ void CLConcatenateLayer::configure_internal(std::vector<TensorType *> &&inputs_v
             for(unsigned int i = 0; i < _num_inputs; ++i)
             {
                 auto kernel = support::cpp14::make_unique<CLHeightConcatenateLayerKernel>();
-                kernel->configure(inputs_vector.at(i), offset, output);
+                kernel->configure(compile_context, inputs_vector.at(i), offset, output);
                 offset += inputs_vector.at(i)->info()->dimension(_axis);
                 _concat_kernels.emplace_back(std::move(kernel));
             }
@@ -140,7 +150,7 @@ void CLConcatenateLayer::configure_internal(std::vector<TensorType *> &&inputs_v
             for(unsigned int i = 0; i < _num_inputs; ++i)
             {
                 auto kernel = support::cpp14::make_unique<CLDepthConcatenateLayerKernel>();
-                kernel->configure(inputs_vector.at(i), offset, output);
+                kernel->configure(compile_context, inputs_vector.at(i), offset, output);
                 offset += inputs_vector.at(i)->info()->dimension(_axis);
                 _concat_kernels.emplace_back(std::move(kernel));
             }
@@ -151,7 +161,7 @@ void CLConcatenateLayer::configure_internal(std::vector<TensorType *> &&inputs_v
             for(unsigned int i = 0; i < _num_inputs; ++i)
             {
                 auto kernel = support::cpp14::make_unique<CLBatchConcatenateLayerKernel>();
-                kernel->configure(inputs_vector.at(i), offset, output);
+                kernel->configure(compile_context, inputs_vector.at(i), offset, output);
                 offset += inputs_vector.at(i)->info()->dimension(_axis);
                 _concat_kernels.emplace_back(std::move(kernel));
             }
diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
index 02927e83ad..68c0fb6ebf 100644
--- a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
+++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
@@ -27,9 +27,15 @@
 namespace arm_compute
 {
 void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, original_input_shape, data_layout);
+}
+
+void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
+                                               DataLayout data_layout)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLConvertFullyConnectedWeightsKernel>();
-    k->configure(input, output, original_input_shape, data_layout);
+    k->configure(compile_context, input, output, original_input_shape, data_layout);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp
index b86a08e957..2b0d7d5e53 100644
--- a/src/runtime/CL/functions/CLConvolution.cpp
+++ b/src/runtime/CL/functions/CLConvolution.cpp
@@ -39,11 +39,17 @@
 using namespace arm_compute;
 
 void CLConvolution3x3::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, scale, border_mode, constant_border_value);
+}
+
+void CLConvolution3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode,
+                                 uint8_t constant_border_value)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLConvolution3x3Kernel>();
-    k->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
+    k->configure(compile_context, input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
 }
 
 template <unsigned int matrix_size>
 CLConvolutionSquare<matrix_size>::CLConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager)
@@ -55,6 +61,13 @@
 
 template <unsigned int matrix_size>
 void CLConvolutionSquare<matrix_size>::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode,
                                                  uint8_t constant_border_value)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, scale, border_mode, constant_border_value);
+}
+
+template <unsigned int matrix_size>
+void CLConvolutionSquare<matrix_size>::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode,
+                                                 uint8_t constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON(conv == nullptr);
@@ -75,17 +88,17 @@
             scale = calculate_matrix_scale(conv, matrix_size);
         }
 
-        _kernel_hor.configure(input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED);
-        _kernel_vert.configure(&_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED, type_pair.second);
-        _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+        _kernel_hor.configure(compile_context, input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED);
+        _kernel_vert.configure(compile_context, &_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED, type_pair.second);
+        _border_handler.configure(compile_context, input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
 
         // Allocate intermediate buffer
         _tmp.allocator()->allocate();
     }
     else
     {
-        _kernel.configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
-        _border_handler.configure(input, _kernel.border_size(), border_mode, PixelValue(constant_border_value));
+        _kernel.configure(compile_context, input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
+        _border_handler.configure(compile_context, input, _kernel.border_size(), border_mode, PixelValue(constant_border_value));
     }
 }
 
@@ -112,9 +125,15 @@
 template class arm_compute::CLConvolutionSquare<7>;
 template class arm_compute::CLConvolutionSquare<9>;
 
 void CLConvolutionRectangle::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode,
                                        uint8_t constant_border_value)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, conv, rows, cols, scale, border_mode, constant_border_value);
+}
+
+void CLConvolutionRectangle::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale,
+                                       BorderMode border_mode, uint8_t constant_border_value)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLConvolutionRectangleKernel>();
-    k->configure(input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED);
+    k->configure(compile_context, input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
 }
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index c271f502e9..b6e1413f7a 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,6 +45,13 @@ CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_ma
 
 void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
                                    const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+}
+
+void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+                                   const WeightsInfo &weights_info,
+                                   const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
@@ -57,7 +64,7 @@ void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, c
         {
             ARM_COMPUTE_ERROR_ON(num_groups != 1);
             auto f = arm_compute::support::cpp14::make_unique<CLWinogradConvolutionLayer>(_memory_manager);
-            f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math);
+            f->configure(compile_context, input, weights, biases, output, conv_info, act_info, enable_fast_math);
             _function = std::move(f);
             break;
         }
@@ -65,21 +72,21 @@ void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, c
         {
             ARM_COMPUTE_ERROR_ON(num_groups != 1);
             auto f = arm_compute::support::cpp14::make_unique<CLDirectConvolutionLayer>();
-            f->configure(input, weights, biases, output, conv_info, act_info);
+            f->configure(compile_context, input, weights, biases, output, conv_info, act_info);
             _function = std::move(f);
             break;
         }
         case ConvolutionMethod::GEMM:
         {
             auto f = arm_compute::support::cpp14::make_unique<CLGEMMConvolutionLayer>(_memory_manager);
-            f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups);
+            f->configure(compile_context, input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups);
             _function = std::move(f);
             break;
         }
         case ConvolutionMethod::FFT:
         {
             auto f = arm_compute::support::cpp14::make_unique<CLFFTConvolutionLayer>(_memory_manager);
-            f->configure(input, weights, biases, output, conv_info, act_info);
+            f->configure(compile_context, input, weights, biases, output, conv_info, act_info);
             _function = std::move(f);
             break;
         }
diff --git a/src/runtime/CL/functions/CLCopy.cpp b/src/runtime/CL/functions/CLCopy.cpp
index 3692fda6b2..4c5d62a82c 100644
--- a/src/runtime/CL/functions/CLCopy.cpp
+++ b/src/runtime/CL/functions/CLCopy.cpp
@@ -36,9 +36,14 @@
 using namespace arm_compute;
 
 void CLCopy::configure(ICLTensor *input, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLCopy::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLCopyKernel>();
-    k->configure(input, output);
+    k->configure(compile_context, input, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp
index 5e1278df5b..17fc80e146 100644
--- a/src/runtime/CL/functions/CLCropResize.cpp
+++ b/src/runtime/CL/functions/CLCropResize.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,11 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#include "arm_compute/runtime/CL/functions/CLCropResize.h"
 #include "arm_compute/core/CL/CLHelpers.h"
-
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/functions/CLCropResize.h"
 
 #include <cstdint>
 
@@ -51,120 +50,10 @@ inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTen
     const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast<uint32_t>(abs(end[0] - start[0])) + 1, static_cast<uint32_t>(abs(end[1] - start[1])) + 1);
     output->info()->set_tensor_shape(out_shape);
 }
-
-inline void run_crop(const ICLTensor *input, ICLTensor *output, uint32_t batch_index, Coordinates start, Coordinates end, float extrapolation_value)
-{
-    bool is_width_flipped  = end[0] < start[0];
-    bool is_height_flipped = end[1] < start[1];
-    /** The number of rows out of bounds at the start and end of output. */
-    std::array<uint32_t, 2> rows_out_of_bounds{ 0 };
-    /** The number of columns out of bounds at the start and end of output. */
-    std::array<uint32_t, 2> cols_out_of_bounds{ 0 };
-    if(is_height_flipped)
-    {
-        rows_out_of_bounds[0] = start[1] >= static_cast<int32_t>(input->info()->dimension(2)) ? std::min(start[1] - input->info()->dimension(2) + 1, output->info()->dimension(2)) : 0;
-        rows_out_of_bounds[1] = end[1] < 0 ? std::min(-end[1], static_cast<int32_t>(output->info()->dimension(2))) : 0;
-    }
-    else
-    {
-        rows_out_of_bounds[0] = start[1] < 0 ? std::min(-start[1], static_cast<int32_t>(output->info()->dimension(2))) : 0;
-        rows_out_of_bounds[1] = end[1] >= static_cast<int32_t>(input->info()->dimension(2)) ? std::min(end[1] - input->info()->dimension(2) + 1, output->info()->dimension(2)) : 0;
-    }
-    if(is_width_flipped)
-    {
-        cols_out_of_bounds[0] = start[0] >= static_cast<int32_t>(input->info()->dimension(1)) ? std::min(start[0] - input->info()->dimension(1) + 1, output->info()->dimension(1)) : 0;
-        cols_out_of_bounds[1] = end[0] < 0 ? std::min(-end[0], static_cast<int32_t>(output->info()->dimension(1))) : 0;
-    }
-    else
-    {
-        cols_out_of_bounds[0] = start[0] < 0 ? std::min(-start[0], static_cast<int32_t>(output->info()->dimension(1))) : 0;
-        cols_out_of_bounds[1] = end[0] >= static_cast<int32_t>(input->info()->dimension(1)) ? std::min(end[0] - input->info()->dimension(1) + 1, output->info()->dimension(1)) : 0;
-    }
-
-    Window full_window = calculate_max_window(*output->info());
-
-    // Full output window:
-    //  --------------------------------
-    //  |          Out of bounds       |
-    //  |          rows before         |
-    //  |------------------------------|
-    //  | Out of | In         | Out of |
-    //  | bounds | bounds     | bounds |
-    //  | cols   | elements   | cols   |
-    //  | before | copied     | after  |
-    //  |        | from input |        |
-    //  |------------------------------|
-    //  |        Out of bounds         |
-    //  |        rows after            |
-    //  |------------------------------|
-    // Use a separate output window for each section of the full output window.
-    // Fill all output rows that have no elements that are within the input bounds
-    // with the extrapolation value using memset.
-    // First for the rows before the in bounds rows.
-    if(rows_out_of_bounds[0] > 0)
-    {
-        Window slice_fill_rows_before(full_window);
-        slice_fill_rows_before.set(2, Window::Dimension(0, rows_out_of_bounds[0], 1));
-        auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
-        kernel->configure(output, extrapolation_value, &slice_fill_rows_before);
-        CLScheduler::get().enqueue(*kernel);
-    }
-
-    Window slice_in(full_window);
-    slice_in.set(2, Window::Dimension(rows_out_of_bounds[0], output->info()->dimension(2) - rows_out_of_bounds[1], 1));
-    slice_in.set(1, Window::Dimension(cols_out_of_bounds[0], output->info()->dimension(1) - cols_out_of_bounds[1], 1));
-
-    int rows_in_bounds = static_cast<int32_t>(output->info()->dimension(2)) - rows_out_of_bounds[0] - rows_out_of_bounds[1];
-    if(rows_in_bounds > 0)
-    {
-        // Fill all elements that share a row with an in bounds element with the extrapolation value.
-        if(cols_out_of_bounds[0] > 0)
-        {
-            Window slice_fill_cols_before(slice_in);
-            slice_fill_cols_before.set(1, Window::Dimension(0, cols_out_of_bounds[0], 1));
-            auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
-            kernel->configure(output, extrapolation_value, &slice_fill_cols_before);
-            CLScheduler::get().enqueue(*kernel);
-        }
-
-        if(cols_out_of_bounds[1] > 0)
-        {
-            Window slice_fill_cols_after(slice_in);
-            slice_fill_cols_after.set(1, Window::Dimension(output->info()->dimension(1) - cols_out_of_bounds[1], output->info()->dimension(1), 1));
-            auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
-            kernel->configure(output, extrapolation_value, &slice_fill_cols_after);
-            CLScheduler::get().enqueue(*kernel);
-        }
-
-        // Copy all elements within the input bounds from the input tensor.
-        int cols_in_bounds = static_cast<int32_t>(output->info()->dimension(1)) - cols_out_of_bounds[0] - cols_out_of_bounds[1];
-        if(cols_in_bounds > 0)
-        {
-            Coordinates2D start_in{ is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0],
-                                    is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0] };
-            Coordinates2D end_in{ is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1,
-                                  is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1 };
-            auto kernel = arm_compute::support::cpp14::make_unique<CLCropKernel>();
-
-            kernel->configure(input, output, start_in, end_in, batch_index, extrapolation_value, &slice_in);
-            CLScheduler::get().enqueue(*kernel);
-        }
-    }
-
-    // Fill all rows after the in bounds elements with the extrapolation value.
-    if(rows_out_of_bounds[1] > 0)
-    {
-        Window slice_fill_rows_after(full_window);
-        slice_fill_rows_after.set(2, Window::Dimension(output->info()->dimension(2) - rows_out_of_bounds[1], output->info()->dimension(2), 1));
-        auto kernel = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
-        kernel->configure(output, extrapolation_value, &slice_fill_rows_after);
-        CLScheduler::get().enqueue(*kernel);
-    }
-}
 } // namespace
 
 CLCropResize::CLCropResize()
-    : _input(nullptr), _boxes(nullptr), _box_ind(nullptr), _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _scale(), _copy(), _crop_results(), _scaled_results()
+    : _input(nullptr), _boxes(nullptr), _box_ind(nullptr), _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _scale(), _copy(), _crop_results(), _scaled_results(), _internal_kernels()
 {
 }
 
@@ -190,9 +79,18 @@ Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITen
 
 void CLCropResize::configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method, extrapolation_value);
+}
+
+void CLCropResize::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
+                             InterpolationPolicy method, float extrapolation_value)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, boxes, box_ind);
     ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value));
 
+    TensorShape output_shape = TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]);
+    auto_init_if_empty(*output->info(), output_shape, 1, DataType::F32);
+
     _num_boxes = boxes->info()->tensor_shape()[1];
     TensorShape out_shape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y);
 
@@ -210,7 +108,13 @@ void CLCropResize::configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor
     //    - A scale function is used to resize the cropped image to the size specified by crop_size.
     //    - A tensor is required to hold the final scaled image before it is copied into the 4D output
     //      that will hold all final cropped and scaled 3D images using CLCopyKernel.
-    for(unsigned int i = 0; i < _num_boxes; ++i)
+
+    // The contents of _boxes and _box_ind are required to calculate the shape
+    // of the initial cropped image and thus are required to configure the
+    // kernels used for cropping and scaling.
+    _boxes->map(CLScheduler::get().queue());
+    _box_ind->map(CLScheduler::get().queue());
+    for(unsigned int num_box = 0; num_box < _num_boxes; ++num_box)
     {
         auto       crop_tensor = support::cpp14::make_unique<CLTensor>();
         TensorInfo crop_result_info(1, DataType::F32);
@@ -223,45 +127,149 @@ void CLCropResize::configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor
         scaled_result_info.set_data_layout(DataLayout::NHWC);
         scale_tensor->allocator()->init(scaled_result_info);
         _scaled_results.emplace_back(std::move(scale_tensor));
-    }
-}
 
-void CLCropResize::run()
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function");
-    // The contents of _boxes and _box_ind are required to calculate the shape
-    // of the initial cropped image and thus are required to configure the
-    // kernels used for cropping and scaling.
- _boxes->map(CLScheduler::get().queue()); - _box_ind->map(CLScheduler::get().queue()); - for(unsigned int i = 0; i < _num_boxes; ++i) - { - // Size of the crop box in _boxes and thus the shape of _crop_results[i] - // may not be known until run-time and so the kernels cannot be configured until then. + // Size of the crop box in _boxes has to be given before the configure uint32_t batch_index; Coordinates start{}; Coordinates end{}; - configure_crop(_input, _boxes, _box_ind, _crop_results[i].get(), i, start, end, batch_index); + configure_crop(_input, _boxes, _box_ind, _crop_results[num_box].get(), num_box, start, end, batch_index); auto scale_kernel = support::cpp14::make_unique(); - scale_kernel->configure(_crop_results[i].get(), _scaled_results[i].get(), _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT); + scale_kernel->configure(compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(), _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT); _scale.emplace_back(std::move(scale_kernel)); Window win = calculate_max_window(*_output->info()); - win.set(3, Window::Dimension(i, i + 1, 1)); + win.set(3, Window::Dimension(num_box, num_box + 1, 1)); auto copy_kernel = support::cpp14::make_unique(); - copy_kernel->configure(_scaled_results[i].get(), _output, PaddingList(), &win); + copy_kernel->configure(compile_context, _scaled_results[num_box].get(), _output, PaddingList(), &win); _copy.emplace_back(std::move(copy_kernel)); - _crop_results[i]->allocator()->allocate(); - _scaled_results[i]->allocator()->allocate(); + _crop_results[num_box]->allocator()->allocate(); + _scaled_results[num_box]->allocator()->allocate(); + + bool is_width_flipped = end[0] < start[0]; + bool is_height_flipped = end[1] < start[1]; + /** The number of rows out of bounds at the start and end of _crop_results[num_box].get(). */ + std::array rows_out_of_bounds{ 0 }; + /** The number of columns out of bounds at the start and end of _crop_results[num_box].get(). */ + std::array cols_out_of_bounds{ 0 }; + if(is_height_flipped) + { + rows_out_of_bounds[0] = start[1] >= static_cast(_input->info()->dimension(2)) ? std::min(start[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0; + rows_out_of_bounds[1] = end[1] < 0 ? std::min(-end[1], static_cast(_crop_results[num_box].get()->info()->dimension(2))) : 0; + } + else + { + rows_out_of_bounds[0] = start[1] < 0 ? std::min(-start[1], static_cast(_crop_results[num_box].get()->info()->dimension(2))) : 0; + rows_out_of_bounds[1] = end[1] >= static_cast(_input->info()->dimension(2)) ? std::min(end[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0; + } + if(is_width_flipped) + { + cols_out_of_bounds[0] = start[0] >= static_cast(_input->info()->dimension(1)) ? std::min(start[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0; + cols_out_of_bounds[1] = end[0] < 0 ? std::min(-end[0], static_cast(_crop_results[num_box].get()->info()->dimension(1))) : 0; + } + else + { + cols_out_of_bounds[0] = start[0] < 0 ? std::min(-start[0], static_cast(_crop_results[num_box].get()->info()->dimension(1))) : 0; + cols_out_of_bounds[1] = end[0] >= static_cast(_input->info()->dimension(1)) ? 
std::min(end[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0; + } + + Window full_window = calculate_max_window(*_crop_results[num_box].get()->info()); + + // Full _crop_results[num_box].get() window: + // -------------------------------- + // | Out of bounds | + // | rows before | + // |------------------------------| + // | Out of | In | Out of | + // | bounds | bounds | bounds | + // | cols | elements | cols | + // | before | copied | after | + // | | from input | | + // |------------------------------| + // | Out of bounds | + // | rows after | + // |------------------------------| + // Use a separate _crop_results[num_box].get() window for each section of the full _crop_results[num_box].get() window. + // Fill all _crop_results[num_box].get() rows that have no elements that are within the input bounds + // with the extrapolation value using memset. + // First for the rows before the in bounds rows. + if(rows_out_of_bounds[0] > 0) + { + Window slice_fill_rows_before(full_window); + slice_fill_rows_before.set(2, Window::Dimension(0, rows_out_of_bounds[0], 1)); + auto kernel = arm_compute::support::cpp14::make_unique(); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_before); + _internal_kernels.push_back(std::move(kernel)); + } + + Window slice_in(full_window); + slice_in.set(2, Window::Dimension(rows_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1)); + slice_in.set(1, Window::Dimension(cols_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1)); + + int rows_in_bounds = static_cast(_crop_results[num_box].get()->info()->dimension(2)) - rows_out_of_bounds[0] - rows_out_of_bounds[1]; + if(rows_in_bounds > 0) + { + // Fill all elements that share a row with an in bounds element with the extrapolation value. + if(cols_out_of_bounds[0] > 0) + { + Window slice_fill_cols_before(slice_in); + slice_fill_cols_before.set(1, Window::Dimension(0, cols_out_of_bounds[0], 1)); + auto kernel = arm_compute::support::cpp14::make_unique(); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_before); + _internal_kernels.push_back(std::move(kernel)); + } - run_crop(_input, _crop_results[i].get(), batch_index, start, end, _extrapolation_value); + if(cols_out_of_bounds[1] > 0) + { + Window slice_fill_cols_after(slice_in); + slice_fill_cols_after.set(1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(1), 1)); + auto kernel = arm_compute::support::cpp14::make_unique(); + kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_after); + _internal_kernels.push_back(std::move(kernel)); + } + + // Copy all elements within the input bounds from the input tensor. + int cols_in_bounds = static_cast(_crop_results[num_box].get()->info()->dimension(1)) - cols_out_of_bounds[0] - cols_out_of_bounds[1]; + if(cols_in_bounds > 0) + { + Coordinates2D start_in{ is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0], + is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0] }; + Coordinates2D end_in{ is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1, + is_height_flipped ? 
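The rework above moves all of CLCropResize's kernel construction out of run() and into configure(): the box coordinates are mapped once, each crop window is partitioned into in-bounds and out-of-bounds bands, and the resulting CLMemsetKernel/CLCropKernel instances are stored in _internal_kernels so run() only enqueues them. The partition arithmetic is easy to lose in the tensor plumbing, so the standalone sketch below distills it for one axis (plain C++; the function name and types are illustrative, not the library's API):

// --- sketch: per-axis out-of-bounds band computation, distilled from the configure() logic above ---
#include <algorithm>
#include <array>
#include <cstdint>
#include <iostream>

// For one axis of a crop of crop_extent elements taken from an input of
// input_extent elements, with the crop spanning [start, end] (inclusive, and
// possibly flipped so end < start), return how many elements at the leading
// and trailing end of the crop fall outside the input and must be filled
// with the extrapolation value.
std::array<int32_t, 2> out_of_bounds(int32_t start, int32_t end, int32_t input_extent, int32_t crop_extent)
{
    std::array<int32_t, 2> oob{ { 0, 0 } };
    if(end < start) // flipped crop
    {
        // Walking backwards: overflow past the input happens at the crop's
        // start, underflow below zero at its end.
        oob[0] = start >= input_extent ? std::min(start - input_extent + 1, crop_extent) : 0;
        oob[1] = end < 0 ? std::min(-end, crop_extent) : 0;
    }
    else
    {
        oob[0] = start < 0 ? std::min(-start, crop_extent) : 0;
        oob[1] = end >= input_extent ? std::min(end - input_extent + 1, crop_extent) : 0;
    }
    return oob;
}

int main()
{
    // A 5-row crop over rows [-2, 2] of a 10-row input: the first two rows of
    // the crop are memset with the extrapolation value, the rest are copied.
    const auto oob = out_of_bounds(-2, 2, 10, 5);
    std::cout << "before: " << oob[0] << ", after: " << oob[1] << '\n'; // before: 2, after: 0
    return 0;
}
// --- end sketch ---

The in-bounds band is what CLCropKernel copies from the input; the bands before and after it are the row and column strips that CLMemsetKernel fills with the extrapolation value, matching the window diagram in the comment above.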
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index 7aa771428d..62e7d9a582 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -43,6 +43,12 @@ CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr<IMemoryManager> memor
 
 void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, const WeightsInfo &weights_info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info, weights_info);
+}
+
+void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
+                                     const WeightsInfo &weights_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
@@ -51,14 +57,14 @@ void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const
         case DeconvolutionMethod::DIRECT:
         {
             auto f = arm_compute::support::cpp14::make_unique<CLDirectDeconvolutionLayer>();
-            f->configure(input, weights, bias, output, deconv_info, weights_info);
+            f->configure(compile_context, input, weights, bias, output, deconv_info, weights_info);
             _function = std::move(f);
             break;
         }
         case DeconvolutionMethod::GEMM:
         {
             auto f = arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager);
-            f->configure(input, weights, bias, output, deconv_info);
+            f->configure(compile_context, input, weights, bias, output, deconv_info);
             _function = std::move(f);
             break;
         }
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
index eaf7c66083..be2d120dcd 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
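CLDeconvolutionLayer shows the overload pattern this whole patch applies to every function: the existing configure() becomes a one-line shim that forwards to a new overload taking a CLCompileContext, defaulting to CLKernelLibrary::get().get_compile_context(), and the context is then threaded into whichever backend function the DeconvolutionMethod switch selects. A minimal sketch of that shape (stand-in types, not the library's headers):

// --- sketch: the dual-overload configure() pattern, with local stand-in types ---
#include <iostream>

struct CLCompileContext { };
struct ICLTensor { };

struct CLKernelLibrary
{
    static CLKernelLibrary &get()
    {
        static CLKernelLibrary lib;
        return lib;
    }
    CLCompileContext &get_compile_context() { return _ctx; }
    CLCompileContext _ctx;
};

class CLExampleFunction // hypothetical function, for illustration only
{
public:
    // Legacy entry point: unchanged signature, now a one-line shim.
    void configure(const ICLTensor *input, ICLTensor *output)
    {
        configure(CLKernelLibrary::get().get_compile_context(), input, output);
    }
    // New entry point: the compile context is threaded through to every kernel
    // or sub-function this object configures, so callers can control which
    // OpenCL context and program cache the kernels are built against.
    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
    {
        (void)compile_context;
        (void)input;
        (void)output;
        std::cout << "configured with an explicit compile context\n";
    }
};

int main()
{
    CLExampleFunction f;
    ICLTensor in, out;
    f.configure(&in, &out); // still works; delegates to the new overload
    return 0;
}
// --- end sketch ---

Because the legacy overload delegates rather than duplicates, existing callers compile unchanged while new callers gain explicit control over kernel compilation.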
* * SPDX-License-Identifier: MIT * @@ -43,12 +43,17 @@ Status CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const IT } void CLDeconvolutionLayerUpsample::configure(ICLTensor *input, ICLTensor *output, const PadStrideInfo &info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, info); +} + +void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PadStrideInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); _output = output; - _memset.configure(_output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info())); - _upsample.configure(input, _output, info); + _memset.configure(compile_context, _output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info())); + _upsample.configure(compile_context, input, _output, info); } void CLDeconvolutionLayerUpsample::run() diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp index 88e3cbcc17..b848f989e6 100644 --- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp +++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp @@ -31,9 +31,14 @@ namespace arm_compute { void CLDepthConvertLayer::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, shift); +} + +void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, policy, shift); + k->configure(compile_context, input, output, policy, shift); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp index 5af979539a..89e5faa4d5 100644 --- a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp +++ b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp @@ -31,9 +31,14 @@ namespace arm_compute { void CLDepthToSpaceLayer::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); +} + +void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, block_shape); + k->configure(compile_context, input, output, block_shape); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp index 00f6f69771..0b7a33401d 100644 --- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp @@ -125,7 +125,14 @@ CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3(std::shared_ptrinfo(), @@ -193,11 +207,11 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure( _memory_group.manage(&_permuted_output); // Configure the function to transform the input tensor from NCHW -> NHWC - _permute_input_to_nhwc.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U)); + _permute_input_to_nhwc.configure(compile_context, input, &_permuted_input, PermutationVector(2U, 0U, 1U)); _permuted_input.info()->set_data_layout(DataLayout::NHWC); // Configure the function to transform the 
weights tensor from IHW -> HWI - _permute_weights_to_nhwc.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U)); + _permute_weights_to_nhwc.configure(compile_context, weights, &_permuted_weights, PermutationVector(2U, 0U, 1U)); _permuted_weights.info()->set_data_layout(DataLayout::NHWC); // Set output quantization info before dwc kernel configure @@ -226,7 +240,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure( dwc_weights_info.n0 = (depth_multiplier == 1) ? 8 : 1; DWCKernelInfo dwc_info; dwc_info.activation_info = act_info; - _dwc_native_kernel.configure(input_to_use, weights_to_use, biases, output_to_use, + _dwc_native_kernel.configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, output_multipliers_to_use, output_shifts_to_use); @@ -236,7 +250,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure( // Configure the function to transform the convoluted output to NCHW format _permuted_output.info()->set_data_layout(DataLayout::NCHW); - _permute_output_to_nchw.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U)); + _permute_output_to_nchw.configure(compile_context, &_permuted_output, output, PermutationVector(1U, 2U, 0U)); _permuted_output.allocator()->allocate(); } @@ -385,6 +399,13 @@ CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::CLDepthwise void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); +} + +void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, + ICLTensor *output, + const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) { const GPUTarget gpu_target = CLScheduler::get().target(); @@ -429,11 +450,11 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config _memory_group.manage(&_permuted_output); // Configure the function to transform the input tensor from NHWC -> NCHW - _permute_input_to_nchw.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U)); + _permute_input_to_nchw.configure(compile_context, input, &_permuted_input, PermutationVector(1U, 2U, 0U)); _permuted_input.info()->set_data_layout(DataLayout::NCHW); // Configure the function to transform the weights tensor from HWI -> IHW - _permute_weights_to_nchw.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U)); + _permute_weights_to_nchw.configure(compile_context, weights, &_permuted_weights, PermutationVector(1U, 2U, 0U)); _permuted_weights.info()->set_data_layout(DataLayout::NCHW); _permuted_output.info()->set_quantization_info(output->info()->quantization_info()); @@ -447,7 +468,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config { if(_needs_weights_reshape) { - _reshape_weights.configure(weights, &_permuted_weights, info); + _reshape_weights.configure(compile_context, weights, &_permuted_weights, info); weights_to_use = &_permuted_weights; } _kernel = 
arm_compute::support::cpp14::make_unique(); @@ -473,7 +494,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config // Configure kernel _kernel->set_target(gpu_target); - _kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, + _kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, act_info, dilation, output_multipliers_to_use, output_shifts_to_use); if(_is_quantized) @@ -487,7 +508,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config { // Configure the function to transform the convoluted output to ACL's native ordering format NCHW _permuted_output.info()->set_data_layout(DataLayout::NCHW); - _permute_output_to_nhwc.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U)); + _permute_output_to_nhwc.configure(compile_context, &_permuted_output, output, PermutationVector(2U, 0U, 1U)); // Allocate tensors _permuted_input.allocator()->allocate(); @@ -499,7 +520,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config { zero_value = PixelValue(static_cast(input->info()->quantization_info().uniform().offset)); } - _border_handler.configure(input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value); + _border_handler.configure(compile_context, input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value); } Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, @@ -574,6 +595,14 @@ CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer(std::shared_ptrinfo(), weights->info(), (biases != nullptr) ? 
biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info, @@ -582,12 +611,12 @@ void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *w { case DepthwiseConvolutionFunction::OPTIMIZED: _func_3x3.set_memory_group(_memory_manager); - _func_3x3.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); + _func_3x3.configure(compile_context, input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); break; case DepthwiseConvolutionFunction::GENERIC: { _func_generic.set_memory_group(_memory_manager); - _func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); + _func_generic.configure(compile_context, input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); } break; default: diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp index 6e4aab2eb9..362b36cc95 100644 --- a/src/runtime/CL/functions/CLDequantizationLayer.cpp +++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp @@ -29,9 +29,14 @@ namespace arm_compute { void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLDequantizationLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output); + k->configure(compile_context, input, output); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLDerivative.cpp b/src/runtime/CL/functions/CLDerivative.cpp index 799724bf2d..68d3752463 100644 --- a/src/runtime/CL/functions/CLDerivative.cpp +++ b/src/runtime/CL/functions/CLDerivative.cpp @@ -32,9 +32,14 @@ using namespace arm_compute; void CLDerivative::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); +} + +void CLDerivative::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED); + k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLDilate.cpp b/src/runtime/CL/functions/CLDilate.cpp index 5962c1dc88..05351a9de3 100644 --- a/src/runtime/CL/functions/CLDilate.cpp +++ b/src/runtime/CL/functions/CLDilate.cpp @@ -32,9 +32,14 @@ using namespace arm_compute; void CLDilate::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value); +} + +void CLDilate::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) { auto k = arm_compute::support::cpp14::make_unique(); - 
k->configure(input, output, border_mode == BorderMode::UNDEFINED); + k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp index 9828cd10ec..6e9782f77a 100644 --- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp @@ -38,12 +38,19 @@ CLDirectConvolutionLayer::CLDirectConvolutionLayer() } void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info); +} + +void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { // Set GPU target _direct_conv_kernel.set_target(CLScheduler::get().target()); // Configure direct convolution - _direct_conv_kernel.configure(input, weights, biases, output, conv_info); + _direct_conv_kernel.configure(compile_context, input, weights, biases, output, conv_info); // Configure border handler PixelValue &&zero_value(0.f); @@ -51,7 +58,7 @@ void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weig { zero_value = PixelValue(0, input->info()->data_type(), input->info()->quantization_info()); } - _input_border_handler.configure(input, _direct_conv_kernel.border_size(), BorderMode::CONSTANT, zero_value); + _input_border_handler.configure(compile_context, input, _direct_conv_kernel.border_size(), BorderMode::CONSTANT, zero_value); // Tune kernels CLScheduler::get().tune_kernel_static(_direct_conv_kernel); @@ -61,7 +68,7 @@ void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weig //Configure Activation Layer if(_is_activationlayer_enabled) { - _activationlayer_function.configure(output, nullptr, act_info); + _activationlayer_function.configure(compile_context, output, nullptr, act_info); } } diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp index 8d90723c95..da16bed3e0 100644 --- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. 
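CLDerivative and CLDilate above (CLErode later in the patch follows suit) all use the same simple-function recipe: configure() instantiates one processing kernel, translating BorderMode::UNDEFINED into the kernel's border_undefined flag, and arms a CLFillBorderKernel to populate a one-pixel halo before the filter runs. A distilled sketch of that composition (hypothetical stand-in types, not the library's API):

// --- sketch: kernel plus border-handler composition, stand-in types ---
#include <cstdint>
#include <iostream>
#include <memory>

enum class BorderMode { UNDEFINED, CONSTANT, REPLICATE };

// Stand-in for a 3x3 neighbourhood kernel: it only needs to know whether the
// caller guarantees valid borders (UNDEFINED) or not.
struct Filter3x3Kernel
{
    void configure(bool border_undefined) { _border_undefined = border_undefined; }
    bool _border_undefined{ false };
};

// Stand-in for CLFillBorderKernel: fills the halo before the filter runs.
struct FillBorderKernel
{
    void configure(int border_size, BorderMode mode, uint8_t constant_value)
    {
        std::cout << "fill " << border_size << "-pixel border, mode " << static_cast<int>(mode)
                  << ", value " << static_cast<int>(constant_value) << '\n';
    }
};

struct SimpleFilterFunction
{
    void configure(BorderMode border_mode, uint8_t constant_border_value)
    {
        auto k = std::make_unique<Filter3x3Kernel>();
        k->configure(border_mode == BorderMode::UNDEFINED);
        _kernel = std::move(k);
        // The border handler always runs first at execution time.
        _border_handler.configure(1, border_mode, constant_border_value);
    }
    std::unique_ptr<Filter3x3Kernel> _kernel;
    FillBorderKernel                 _border_handler;
};

int main()
{
    SimpleFilterFunction f;
    f.configure(BorderMode::CONSTANT, 0);
    return 0;
}
// --- end sketch ---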
* * SPDX-License-Identifier: MIT * @@ -87,10 +87,10 @@ Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITen ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], "Output's height is invalid."); ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], "Output's depth is invalid."); - unsigned int deconv_pad_x = 0; - unsigned int deconv_pad_y = 0; - const unsigned int stride_x = info.stride().first; - const unsigned int stride_y = info.stride().second; + unsigned int deconv_pad_x = 0; + unsigned int deconv_pad_y = 0; + const unsigned int stride_x = info.stride().first; + const unsigned int stride_y = info.stride().second; const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout)); const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); @@ -103,6 +103,12 @@ Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITen void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, const WeightsInfo &weights_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info, weights_info); +} + +void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, + const WeightsInfo &weights_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); @@ -110,8 +116,8 @@ void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const unsigned int pad_right = info.pad_right(); const unsigned int pad_top = info.pad_top(); const unsigned int pad_bottom = info.pad_bottom(); - const unsigned int stride_x = info.stride().first; - const unsigned int stride_y = info.stride().second; + const unsigned int stride_x = info.stride().first; + const unsigned int stride_y = info.stride().second; const DataLayout data_layout = input->info()->data_layout(); @@ -121,7 +127,7 @@ void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, _original_weights = weights; _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); - _flip_weights.configure(weights, &_weights_flipped, &_flip_axis); + _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis); auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info); @@ -146,14 +152,14 @@ void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, unsigned int deconv_pad_right = pad_left > pad_right ? pad_left - pad_right : 0; deconv_pad_x -= deconv_pad_left + deconv_pad_right; ARM_COMPUTE_ERROR_ON((deconv_pad_x % 2) != 0); - deconv_pad_left += deconv_pad_x / 2; + deconv_pad_left += deconv_pad_x / 2; deconv_pad_right += deconv_pad_x / 2; unsigned int deconv_pad_top = pad_bottom > pad_top ? pad_bottom - pad_top : 0; unsigned int deconv_pad_bottom = pad_top > pad_bottom ? 
pad_top - pad_bottom : 0; deconv_pad_y -= deconv_pad_top + deconv_pad_bottom; ARM_COMPUTE_ERROR_ON((deconv_pad_y % 2) != 0); - deconv_pad_top += deconv_pad_y / 2; + deconv_pad_top += deconv_pad_y / 2; deconv_pad_bottom += deconv_pad_y / 2; TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info()); @@ -162,11 +168,11 @@ void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, // configure scale function const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR); - _scale_f.configure(input, &_scaled_output, upsample_info); + _scale_f.configure(compile_context, input, &_scaled_output, upsample_info); // Setup the function to convolve the upscaled output const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); - _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info); + _conv_f.configure(compile_context, &_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info); _scaled_output.allocator()->allocate(); // Setup flip axis data diff --git a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp index 9955d240f9..ce615327a9 100644 --- a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp +++ b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp @@ -31,9 +31,14 @@ namespace arm_compute { void CLRsqrtLayer::configure(const ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLRsqrtLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, ElementWiseUnary::RSQRT); + k->configure(compile_context, input, output, ElementWiseUnary::RSQRT); _kernel = std::move(k); } Status CLRsqrtLayer::validate(const ITensorInfo *input, const ITensorInfo *output) @@ -42,9 +47,14 @@ Status CLRsqrtLayer::validate(const ITensorInfo *input, const ITensorInfo *outpu } void CLExpLayer::configure(const ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLExpLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, ElementWiseUnary::EXP); + k->configure(compile_context, input, output, ElementWiseUnary::EXP); _kernel = std::move(k); } Status CLExpLayer::validate(const ITensorInfo *input, const ITensorInfo *output) @@ -53,9 +63,14 @@ Status CLExpLayer::validate(const ITensorInfo *input, const ITensorInfo *output) } void CLNegLayer::configure(const ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLNegLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, ElementWiseUnary::NEG); + k->configure(compile_context, input, output, ElementWiseUnary::NEG); _kernel = std::move(k); } Status CLNegLayer::validate(const ITensorInfo *input, const ITensorInfo *output) @@ -64,9 +79,14 @@ Status CLNegLayer::validate(const ITensorInfo *input, const ITensorInfo *output) } void CLSinLayer::configure(const ICLTensor *input, ICLTensor *output) +{ + 
configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLSinLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, ElementWiseUnary::SIN); + k->configure(compile_context, input, output, ElementWiseUnary::SIN); _kernel = std::move(k); } Status CLSinLayer::validate(const ITensorInfo *input, const ITensorInfo *output) @@ -75,9 +95,14 @@ Status CLSinLayer::validate(const ITensorInfo *input, const ITensorInfo *output) } void CLAbsLayer::configure(const ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLAbsLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, ElementWiseUnary::ABS); + k->configure(compile_context, input, output, ElementWiseUnary::ABS); _kernel = std::move(k); } Status CLAbsLayer::validate(const ITensorInfo *input, const ITensorInfo *output) @@ -85,9 +110,14 @@ Status CLAbsLayer::validate(const ITensorInfo *input, const ITensorInfo *output) return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::ABS); } void CLLogLayer::configure(const ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLLogLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, ElementWiseUnary::LOG); + k->configure(compile_context, input, output, ElementWiseUnary::LOG); _kernel = std::move(k); } Status CLLogLayer::validate(const ITensorInfo *input, const ITensorInfo *output) @@ -96,9 +126,14 @@ Status CLLogLayer::validate(const ITensorInfo *input, const ITensorInfo *output) } void CLRoundLayer::configure(const ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLRoundLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, ElementWiseUnary::ROUND); + k->configure(compile_context, input, output, ElementWiseUnary::ROUND); _kernel = std::move(k); } Status CLRoundLayer::validate(const ITensorInfo *input, const ITensorInfo *output) diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp index 7636a87e93..20e9545b61 100644 --- a/src/runtime/CL/functions/CLElementwiseOperations.cpp +++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp @@ -33,7 +33,7 @@ namespace arm_compute { namespace { -void configure_border_handler(CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output) +void configure_border_handler(const CLCompileContext &compile_context, CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output) { if(output->info()->dimension(0) > 1) { @@ -41,18 +41,23 @@ void configure_border_handler(CLFillBorderKernel &border_handler, BorderSize bor if(broadcasted_info->info()->dimension(0) == 1) { - border_handler.configure(broadcasted_info, border_size, BorderMode::REPLICATE); + border_handler.configure(compile_context, broadcasted_info, 
border_size, BorderMode::REPLICATE); } } } } // namespace void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info); +} + +void CLArithmeticAddition::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(ArithmeticOperation::ADD, input1, input2, output, policy, act_info); + k->configure(compile_context, ArithmeticOperation::ADD, input1, input2, output, policy, act_info); _kernel = std::move(k); - configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output); + configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) @@ -61,11 +66,16 @@ Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorIn } void CLArithmeticSubtraction::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info); +} + +void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(ArithmeticOperation::SUB, input1, input2, output, policy, act_info); + k->configure(compile_context, ArithmeticOperation::SUB, input1, input2, output, policy, act_info); _kernel = std::move(k); - configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output); + configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) @@ -75,11 +85,16 @@ Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITenso } void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); +} + +void CLArithmeticDivision::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(ArithmeticOperation::DIV, input1, input2, output, act_info); + k->configure(compile_context, ArithmeticOperation::DIV, input1, input2, output, act_info); _kernel = std::move(k); - configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output); + configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) @@ -88,11 +103,16 @@ Status 
CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorIn } void CLElementwiseMax::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); +} + +void CLElementwiseMax::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(ArithmeticOperation::MAX, input1, input2, output, act_info); + k->configure(compile_context, ArithmeticOperation::MAX, input1, input2, output, act_info); _kernel = std::move(k); - configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output); + configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) @@ -101,11 +121,16 @@ Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo * } void CLElementwiseMin::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); +} + +void CLElementwiseMin::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(ArithmeticOperation::MIN, input1, input2, output, act_info); + k->configure(compile_context, ArithmeticOperation::MIN, input1, input2, output, act_info); _kernel = std::move(k); - configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output); + configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) @@ -114,11 +139,16 @@ Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo * } void CLElementwiseSquaredDiff::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); +} + +void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(ArithmeticOperation::SQUARED_DIFF, input1, input2, output, act_info); + k->configure(compile_context, ArithmeticOperation::SQUARED_DIFF, input1, input2, output, act_info); _kernel = std::move(k); - configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output); + configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) @@ -127,11 +157,16 @@ Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITens } void CLElementwisePower::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const 
ActivationLayerInfo &act_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info); +} + +void CLElementwisePower::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(ArithmeticOperation::POWER, input1, input2, output, act_info); + k->configure(compile_context, ArithmeticOperation::POWER, input1, input2, output, act_info); _kernel = std::move(k); - configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output); + configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) diff --git a/src/runtime/CL/functions/CLEqualizeHistogram.cpp b/src/runtime/CL/functions/CLEqualizeHistogram.cpp index a0663b754a..e1bd7e6f2a 100644 --- a/src/runtime/CL/functions/CLEqualizeHistogram.cpp +++ b/src/runtime/CL/functions/CLEqualizeHistogram.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -89,9 +89,14 @@ CLEqualizeHistogram::CLEqualizeHistogram() void CLEqualizeHistogram::configure(const ICLImage *input, ICLImage *output) { - _histogram_kernel.configure(input, &_hist); - _border_histogram_kernel.configure(input, &_hist); - _map_histogram_kernel.configure(input, &_cd_lut, output); + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLEqualizeHistogram::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLImage *output) +{ + _histogram_kernel.configure(compile_context, input, &_hist); + _border_histogram_kernel.configure(compile_context, input, &_hist); + _map_histogram_kernel.configure(compile_context, input, &_cd_lut, output); } void CLEqualizeHistogram::run() diff --git a/src/runtime/CL/functions/CLErode.cpp b/src/runtime/CL/functions/CLErode.cpp index 48d7be385d..8106148316 100644 --- a/src/runtime/CL/functions/CLErode.cpp +++ b/src/runtime/CL/functions/CLErode.cpp @@ -32,9 +32,14 @@ using namespace arm_compute; void CLErode::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value); +} + +void CLErode::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) { auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, border_mode == BorderMode::UNDEFINED); + k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp index 49b5a2a2e6..c3922f5e66 100644 --- a/src/runtime/CL/functions/CLFFT1D.cpp +++ b/src/runtime/CL/functions/CLFFT1D.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. 
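The configure_border_handler() helper shared by the elementwise functions above only arms the fill-border kernel when the operation broadcasts along x: if the output is wider than one element but one of the inputs has dimension(0) == 1, that input gets a REPLICATE border so reads past the broadcast scalar stay valid. The decision logic, distilled into plain C++ (illustrative shapes, not the library's API):

// --- sketch: broadcast detection behind configure_border_handler() ---
#include <iostream>

struct Shape
{
    int x; // dimension(0)
};

// Decide which input, if any, needs a replicated border because it is
// broadcast along x, mirroring the helper above.
const Shape *broadcast_input_needing_border(const Shape &input1, const Shape &input2, const Shape &output)
{
    if(output.x > 1)
    {
        // Pick the candidate the same way the helper does...
        const Shape *broadcasted = (input1.x == 1) ? &input1 : &input2;
        // ...and only configure the border handler if it really is broadcast.
        if(broadcasted->x == 1)
        {
            return broadcasted;
        }
    }
    return nullptr;
}

int main()
{
    Shape a{ 1 }, b{ 16 }, out{ 16 };
    std::cout << (broadcast_input_needing_border(a, b, out) == &a) << '\n'; // 1: input1 is broadcast
    Shape c{ 16 };
    std::cout << (broadcast_input_needing_border(c, b, out) != nullptr) << '\n'; // 0: no broadcast, no border fill
    return 0;
}
// --- end sketch ---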
 *
 * SPDX-License-Identifier: MIT
 *
@@ -36,6 +36,11 @@ CLFFT1D::CLFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
 }
 
 void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
+}
+
+void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(CLFFT1D::validate(input->info(), output->info(), config));
@@ -57,7 +62,7 @@ void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DIn
     TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32);
     _digit_reverse_indices.allocator()->init(digit_reverse_indices_info);
     _memory_group.manage(&_digit_reversed_input);
-    _digit_reverse_kernel.configure(input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);
+    _digit_reverse_kernel.configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);
 
     // Create and configure FFT kernels
     unsigned int Nx = 1;
@@ -72,7 +77,7 @@ void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DIn
         fft_kernel_info.radix          = radix_for_stage;
         fft_kernel_info.Nx             = Nx;
         fft_kernel_info.is_first_stage = (i == 0);
-        _fft_kernels[i].configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
+        _fft_kernels[i].configure(compile_context, &_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
 
         Nx *= radix_for_stage;
     }
@@ -83,7 +88,7 @@ void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DIn
         FFTScaleKernelInfo scale_config;
         scale_config.scale     = static_cast<float>(N);
         scale_config.conjugate = config.direction == FFTDirection::Inverse;
-        is_c2r ? _scale_kernel.configure(&_digit_reversed_input, output, scale_config) : _scale_kernel.configure(output, nullptr, scale_config);
+        is_c2r ? _scale_kernel.configure(compile_context, &_digit_reversed_input, output, scale_config) : _scale_kernel.configure(compile_context, output, nullptr, scale_config);
     }
 
     // Allocate tensors
diff --git a/src/runtime/CL/functions/CLFFT2D.cpp b/src/runtime/CL/functions/CLFFT2D.cpp
index f5776cbd88..2482ea901a 100644
--- a/src/runtime/CL/functions/CLFFT2D.cpp
+++ b/src/runtime/CL/functions/CLFFT2D.cpp
@@ -35,6 +35,11 @@ CLFFT2D::CLFFT2D(std::shared_ptr<IMemoryManager> memory_manager)
 }
 
 void CLFFT2D::configure(const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
+}
+
+void CLFFT2D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(CLFFT2D::validate(input->info(), output->info(), config));
@@ -44,13 +49,13 @@ void CLFFT2D::configure(const ICLTensor *input, ICLTensor *output, const FFT2DIn
     first_pass_config.axis      = config.axis0;
     first_pass_config.direction = config.direction;
     _memory_group.manage(&_first_pass_tensor);
-    _first_pass_func.configure(input, &_first_pass_tensor, first_pass_config);
+    _first_pass_func.configure(compile_context, input, &_first_pass_tensor, first_pass_config);
 
     // Setup second pass
     FFT1DInfo second_pass_config;
     second_pass_config.axis      = config.axis1;
     second_pass_config.direction = config.direction;
-    _second_pass_func.configure(&_first_pass_tensor, output, second_pass_config);
+    _second_pass_func.configure(compile_context, &_first_pass_tensor, output, second_pass_config);
     _first_pass_tensor.allocator()->allocate();
 }
 
diff --git a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
index afb1cab520..ff439cca8d 100644
--- a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
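CLFFT1D above decomposes an N-point transform into a digit-reverse permutation followed by one radix kernel per stage; Nx accumulates the product of the radices already processed, and an optional scale kernel divides by N on the inverse path. The factorisation that drives the stage loop can be sketched in isolation; the greedy strategy and the radix set below are assumptions for illustration (the library derives its stages from the radix kernels it actually ships):

// --- sketch: stage factorisation behind the FFT kernel loop above ---
#include <iostream>
#include <vector>

// Greedy factorisation of N into supported radices, largest first.
std::vector<unsigned int> decompose(unsigned int N, const std::vector<unsigned int> &radices = { 8, 7, 5, 4, 3, 2 })
{
    std::vector<unsigned int> stages;
    for(unsigned int r : radices)
    {
        while(N % r == 0)
        {
            stages.push_back(r);
            N /= r;
        }
    }
    return N == 1 ? stages : std::vector<unsigned int>{}; // empty: unsupported size
}

int main()
{
    // Mirrors the configure() loop above: Nx starts at 1 and is multiplied by
    // each stage's radix after that stage's kernel is configured.
    unsigned int Nx = 1;
    for(unsigned int radix : decompose(120))
    {
        std::cout << "stage radix " << radix << ", Nx " << Nx << '\n';
        Nx *= radix;
    }
    return 0;
}
// --- end sketch ---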
* * SPDX-License-Identifier: MIT * @@ -97,6 +97,12 @@ CLFFTConvolutionLayer::CLFFTConvolutionLayer(std::shared_ptr mem void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info); +} + +void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { _original_weights = weights; _original_bias = biases; @@ -121,7 +127,7 @@ void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights // Permute bias if(biases != nullptr) { - _permute_bias_func.configure(biases, &_permuted_bias, PermutationVector(1U, 2U, 0U)); + _permute_bias_func.configure(compile_context, biases, &_permuted_bias, PermutationVector(1U, 2U, 0U)); _permuted_bias.info()->set_data_layout(DataLayout::NCHW); } @@ -131,11 +137,11 @@ void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights { _memory_group.manage(&_permuted_input); // Configure the function to transform the input tensor from NHWC -> NCHW - _permute_input_func.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U)); + _permute_input_func.configure(compile_context, input, &_permuted_input, PermutationVector(1U, 2U, 0U)); _permuted_input.info()->set_data_layout(DataLayout::NCHW); // Configure the function to transform the weights tensor from HWI -> IHW - _permute_weights_func.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U)); + _permute_weights_func.configure(compile_context, weights, &_permuted_weights, PermutationVector(1U, 2U, 0U)); _permuted_weights.info()->set_data_layout(DataLayout::NCHW); input_to_use = &_permuted_input; @@ -145,20 +151,20 @@ void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights // Flip weights _flipped_weights.allocator()->init(weights_to_use->info()->clone()->set_is_resizable(true).reset_padding()); _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); - _flip_weights_func.configure(weights_to_use, &_flipped_weights, &_flip_axis); + _flip_weights_func.configure(compile_context, weights_to_use, &_flipped_weights, &_flip_axis); // Pad weights const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } }; - _pad_weights_func.configure(&_flipped_weights, &_padded_weights, padding_w); + _pad_weights_func.configure(compile_context, &_flipped_weights, &_padded_weights, padding_w); // Transform weights _transform_weights_func = support::cpp14::make_unique(); - _transform_weights_func->configure(&_padded_weights, &_transformed_weights, FFT2DInfo()); + _transform_weights_func->configure(compile_context, &_padded_weights, &_transformed_weights, FFT2DInfo()); // Pad input const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } }; _memory_group.manage(&_padded_input); - _pad_input_func.configure(input_to_use, &_padded_input, padding_in); + _pad_input_func.configure(compile_context, input_to_use, &_padded_input, padding_in); if(_needs_permute) { _permuted_input.allocator()->allocate(); @@ -166,17 +172,17 @@ void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights // Transform 
input _memory_group.manage(&_transformed_input); - _transform_input_func.configure(&_padded_input, &_transformed_input, FFT2DInfo()); + _transform_input_func.configure(compile_context, &_padded_input, &_transformed_input, FFT2DInfo()); _padded_input.allocator()->allocate(); // Perform product _memory_group.manage(&_output_product); - _prod_func.configure(&_transformed_input, &_transformed_weights, &_output_product); + _prod_func.configure(compile_context, &_transformed_input, &_transformed_weights, &_output_product); _transformed_input.allocator()->allocate(); // Perform reduction _memory_group.manage(&_output_reduced); - _reduce_func.configure(&_output_product, &_output_reduced, 2, ReductionOperation::SUM); + _reduce_func.configure(compile_context, &_output_product, &_output_reduced, 2, ReductionOperation::SUM); _output_product.allocator()->allocate(); // Transform output @@ -184,7 +190,7 @@ void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights FFT2DInfo itranform_info; itranform_info.direction = FFTDirection::Inverse; _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); - _itransform_output_func.configure(&_output_reduced, &_itransformed_output, itranform_info); + _itransform_output_func.configure(compile_context, &_output_reduced, &_itransformed_output, itranform_info); _output_reduced.allocator()->allocate(); // Reshape output @@ -206,7 +212,7 @@ void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); } - _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton)); + _extract_output_func.configure(compile_context, &_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton)); _itransformed_output.allocator()->allocate(); // Add bias @@ -219,7 +225,7 @@ void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights _memory_group.manage(&_permuted_output); } auto_init_if_empty(*output_to_use->info(), *_bias_output.info()); - _bias_add_func.configure(&_bias_output, &_permuted_bias, output_to_use, ConvertPolicy::WRAP); + _bias_add_func.configure(compile_context, &_bias_output, &_permuted_bias, output_to_use, ConvertPolicy::WRAP); _bias_output.allocator()->allocate(); } @@ -228,7 +234,7 @@ void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights { // Configure the function to transform the convoluted output to ACL's native ordering format NCHW _permuted_output.info()->set_data_layout(DataLayout::NCHW); - _permute_output_func.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U)); + _permute_output_func.configure(compile_context, &_permuted_output, output, PermutationVector(2U, 0U, 1U)); // Allocate tensors _permuted_output.allocator()->allocate(); @@ -238,7 +244,7 @@ void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights _is_activationlayer_enabled = act_info.enabled(); if(_is_activationlayer_enabled) { - _activation_layer_func.configure(output, nullptr, act_info); + _activation_layer_func.configure(compile_context, output, nullptr, act_info); } // Setup flip axis data diff --git a/src/runtime/CL/functions/CLFastCorners.cpp b/src/runtime/CL/functions/CLFastCorners.cpp index fe2a18cd30..f51abf0880 100644 --- a/src/runtime/CL/functions/CLFastCorners.cpp +++ 
diff --git a/src/runtime/CL/functions/CLFastCorners.cpp b/src/runtime/CL/functions/CLFastCorners.cpp
index fe2a18cd30..f51abf0880 100644
--- a/src/runtime/CL/functions/CLFastCorners.cpp
+++ b/src/runtime/CL/functions/CLFastCorners.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -54,6 +54,12 @@ CLFastCorners::CLFastCorners(std::shared_ptr<IMemoryManager> memory_manager)
 void CLFastCorners::configure(const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners,
                               unsigned int *num_corners, BorderMode border_mode, uint8_t constant_border_value)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, threshold, nonmax_suppression, corners, num_corners, border_mode, constant_border_value);
+}
+
+void CLFastCorners::configure(const CLCompileContext &compile_context, const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners,
+                              unsigned int *num_corners, BorderMode border_mode, uint8_t constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
     ARM_COMPUTE_ERROR_ON(BorderMode::UNDEFINED != border_mode);
@@ -72,19 +78,19 @@ void CLFastCorners::configure(const ICLImage *input, float threshold, bool nonma
     const bool update_number = (nullptr != _num_corners);
 
     _memory_group.manage(&_output);
-    _fast_corners_kernel.configure(input, &_output, threshold, nonmax_suppression, border_mode);
+    _fast_corners_kernel.configure(compile_context, input, &_output, threshold, nonmax_suppression, border_mode);
 
     if(!_non_max)
     {
-        _copy_array_kernel.configure(&_output, update_number, _corners, &_num_buffer);
+        _copy_array_kernel.configure(compile_context, &_output, update_number, _corners, &_num_buffer);
     }
     else
     {
         _suppr.allocator()->init(tensor_info);
         _memory_group.manage(&_suppr);
 
-        _suppr_func.configure(&_output, &_suppr, border_mode);
-        _copy_array_kernel.configure(&_suppr, update_number, _corners, &_num_buffer);
+        _suppr_func.configure(compile_context, &_output, &_suppr, border_mode);
+        _copy_array_kernel.configure(compile_context, &_suppr, update_number, _corners, &_num_buffer);
 
         _suppr.allocator()->allocate();
     }
diff --git a/src/runtime/CL/functions/CLFill.cpp b/src/runtime/CL/functions/CLFill.cpp
index 035bb7ce8d..7b96ed1592 100644
--- a/src/runtime/CL/functions/CLFill.cpp
+++ b/src/runtime/CL/functions/CLFill.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -30,11 +30,15 @@
 namespace arm_compute
 {
-
 void CLFill::configure(ICLTensor *tensor, PixelValue constant_value)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), tensor, constant_value);
+}
+
+void CLFill::configure(const CLCompileContext &compile_context, ICLTensor *tensor, PixelValue constant_value)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLMemsetKernel>();
-    k->configure(tensor, constant_value);
+    k->configure(compile_context, tensor, constant_value);
     _kernel = std::move(k);
 }
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLFillBorder.cpp b/src/runtime/CL/functions/CLFillBorder.cpp
index b7d77bcbb7..f9d7396c5b 100644
--- a/src/runtime/CL/functions/CLFillBorder.cpp
+++ b/src/runtime/CL/functions/CLFillBorder.cpp
@@ -31,8 +31,13 @@ using namespace arm_compute;
 void CLFillBorder::configure(ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), tensor, border_width, border_mode, constant_border_value);
+}
+
+void CLFillBorder::configure(const CLCompileContext &compile_context, ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLFillBorderKernel>();
-    k->configure(tensor, BorderSize(border_width), border_mode, constant_border_value);
+    k->configure(compile_context, tensor, BorderSize(border_width), border_mode, constant_border_value);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp
index 8f93e03c5e..9a247ccfcb 100644
--- a/src/runtime/CL/functions/CLFlattenLayer.cpp
+++ b/src/runtime/CL/functions/CLFlattenLayer.cpp
@@ -30,9 +30,14 @@ using namespace arm_compute;
 void CLFlattenLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLFlattenLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLFlattenLayerKernel>();
-    k->configure(input, output);
+    k->configure(compile_context, input, output);
     _kernel = std::move(k);
     CLScheduler::get().tune_kernel_static(*_kernel);
 }
diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp
index 204ca7400c..44e1d39dc2 100644
--- a/src/runtime/CL/functions/CLFloor.cpp
+++ b/src/runtime/CL/functions/CLFloor.cpp
@@ -33,7 +33,7 @@ void CLFloor::configure(const ICLTensor *input, ICLTensor *output)
     configure(CLKernelLibrary::get().get_compile_context(), input, output);
 }
 
-void CLFloor::configure(CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+void CLFloor::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLFloorKernel>();
     k->configure(compile_context, input, output);
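For the simple one-kernel wrappers above (CLFill, CLFillBorder, CLFlattenLayer, CLFloor), the caller-visible change is that a compile context can now be supplied explicitly. A hedged usage sketch, assuming a default-initialised scheduler (helper name run_floor_example is illustrative only):

    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLFloor.h"

    using namespace arm_compute;

    void run_floor_example()
    {
        CLScheduler::get().default_init(); // create context, queue and kernel library

        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

        CLFloor floor_func;
        // Either rely on the implicit default context...
        floor_func.configure(&src, &dst);
        // ...or pass one explicitly (the same object the legacy overload fetches):
        // floor_func.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        floor_func.run();
    }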
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 0c0fbd5c9d..ecbac6f703 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -146,9 +146,14 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
 } // namespace
 
 void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLFullyConnectedLayerReshapeWeights::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>();
-    k->configure(input, output);
+    k->configure(compile_context, input, output);
     _kernel = std::move(k);
 }
 
@@ -163,7 +168,8 @@ CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> mem
     _are_weights_reshaped(true), _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr)
 {
 }
-void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info)
+void CLFullyConnectedLayer::configure_mm(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
+                                         const FullyConnectedLayerInfo &fc_info)
 {
     GEMMLowpOutputStageInfo gemmlowp_output_stage;
     construct_gemmlowp_output_stage(*input->info(), *weights->info(), *output->info(), gemmlowp_output_stage, fc_info.activation_info);
@@ -190,7 +196,7 @@ void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor
         weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
 
         // Configure gemmlowp function
-        _mm_gemmlowp.configure(input, weights, bias, output, gemm_info);
+        _mm_gemmlowp.configure(compile_context, input, weights, bias, output, gemm_info);
 
         // Revert back QuantizatioInfo as input and weights could be used in other fully connected layers
         input->info()->set_quantization_info(input_quantization_info);
@@ -199,11 +205,12 @@ void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor
     else
     {
         // Configure matrix multiply kernel
-        _mm_gemm.configure(input, weights, bias, output, 1.f, 1.f, gemm_info);
+        _mm_gemm.configure(compile_context, input, weights, bias, output, 1.f, 1.f, gemm_info);
     }
 }
 
-void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info)
+void CLFullyConnectedLayer::configure_conv_fc(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
+                                              const FullyConnectedLayerInfo &fc_info)
 {
     ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
 
@@ -215,25 +222,32 @@ void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLT
     // Configure flatten kernel
     _memory_group.manage(&_flatten_output);
-    _flatten_layer.configure(input, &_flatten_output);
+    _flatten_layer.configure(compile_context, input, &_flatten_output);
 
     // Configure matrix multiply kernel
-    configure_mm(&_flatten_output, weights, bias, output, fc_info);
+    configure_mm(compile_context, &_flatten_output, weights, bias, output, fc_info);
 
     // Allocate the output tensor for flatten once all the configure methods have been called
     _flatten_output.allocator()->allocate();
 }
 
-void CLFullyConnectedLayer::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info)
+void CLFullyConnectedLayer::configure_fc_fc(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
+                                            const FullyConnectedLayerInfo &fc_info)
 {
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
 
     // Configure matrix multiply kernel
-    configure_mm(input, weights, bias, output, fc_info);
+    configure_mm(compile_context, input, weights, bias, output, fc_info);
 }
 
 void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, FullyConnectedLayerInfo fc_info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, fc_info);
+}
+
+void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+                                      FullyConnectedLayerInfo fc_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
@@ -282,13 +296,13 @@ void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *w
     {
         if(_weights_manager && _weights_manager->are_weights_managed(weights))
         {
-            _reshape_weights_managed_function.configure(weights);
+            _reshape_weights_managed_function.configure(compile_context, weights);
             weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_reshape_weights_managed_function));
         }
         else
         {
             // Reshape the weights
-            _reshape_weights_function.configure(weights, &_reshape_weights_output);
+            _reshape_weights_function.configure(compile_context, weights, &_reshape_weights_output);
             weights_to_use = &_reshape_weights_output;
         }
     }
@@ -298,7 +312,7 @@ void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *w
     {
         if(_weights_manager && _weights_manager->are_weights_managed(weights_to_use))
         {
-            _convert_weights_managed.configure(weights_to_use,
+            _convert_weights_managed.configure(compile_context, weights_to_use,
                                                input->info()->tensor_shape(),
                                                fc_info.weights_trained_layout);
             weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_convert_weights_managed));
@@ -306,7 +320,7 @@ void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *w
         else
         {
             // Convert weights
-            _convert_weights.configure(weights_to_use,
+            _convert_weights.configure(compile_context, weights_to_use,
                                        &_converted_weights_output,
                                        input->info()->tensor_shape(),
                                        fc_info.weights_trained_layout);
@@ -319,12 +333,12 @@ void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *w
     if(_is_fc_after_conv)
     {
         // Fully Connected layer after a Convolution Layer without batches
-        configure_conv_fc(input, weights_to_use, biases, output, fc_info);
+        configure_conv_fc(compile_context, input, weights_to_use, biases, output, fc_info);
     }
     else
     {
         // Fully Connected layer after a Fully Connected Layer without batches
-        configure_fc_fc(input, weights_to_use, biases, output, fc_info);
+        configure_fc_fc(compile_context, input, weights_to_use, biases, output, fc_info);
    }
 }
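Worth calling out in configure_mm() above: for the quantized path, CLGEMMLowpMatrixMultiplyCore expects the input/weights zero points negated at configure time, so the function temporarily rewrites the quantization info and restores it afterwards. Condensed from the hunk (simplified, uniform quantization assumed):

    // "Negate offsets, configure, restore" idiom from configure_mm():
    const QuantizationInfo input_quantization_info   = input->info()->quantization_info();
    const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();

    input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
    weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));

    _mm_gemmlowp.configure(compile_context, input, weights, bias, output, gemm_info);

    // Restore the original infos: the same tensors may feed other layers.
    input->info()->set_quantization_info(input_quantization_info);
    weights->info()->set_quantization_info(weights_quantization_info);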
diff --git a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
index 72dd27e3cc..6deecdc089 100644
--- a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
+++ b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -41,7 +41,15 @@ void CLFuseBatchNormalization::configure(const ICLTensor *input_weights, const I
                                          const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
                                          float epsilon, FuseBatchNormalizationType fbn_type)
 {
-    _fuse_bn_kernel.configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+    configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+}
+
+void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
+                                         ICLTensor *fused_weights, ICLTensor *fused_bias,
+                                         const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
+                                         float epsilon, FuseBatchNormalizationType fbn_type)
+{
+    _fuse_bn_kernel.configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
 }
 
 Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 74d59cdad1..8466024c04 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -81,7 +81,8 @@ CLGEMMKernelType CLGEMM::select_gemm_kernel(unsigned int m, unsigned int n, unsi
     return gemm_kernel->select_kernel(params);
 }
 
-void CLGEMM::configure_native_v1(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+void CLGEMM::configure_native_v1(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta,
+                                 const GEMMInfo &gemm_info)
 {
     const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
     const unsigned int n = b->info()->dimension(0);
@@ -94,13 +95,14 @@ void CLGEMM::configure_native_v1(const ICLTensor *a, const ICLTensor *b, const I
     GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d(), gemm_info.broadcast_bias());
 
     // Configure and tune matrix multiply kernel
-    _mm_kernel.configure(a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
+    _mm_kernel.configure(compile_context, a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
 
     // Tune kernel statically
     CLScheduler::get().tune_kernel_static(_mm_kernel);
 }
 
-void CLGEMM::configure_reshaped_v1(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+void CLGEMM::configure_reshaped_v1(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta,
+                                   const GEMMInfo &gemm_info)
 {
     bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
     const unsigned int m                       = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
@@ -148,22 +150,22 @@ void CLGEMM::configure_reshaped_v1(const ICLTensor *a, const ICLTensor *b, const
     }
 
     // Configure interleave kernel
-    _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, reinterpret_input_as_3d);
+    _reshape_lhs_kernel.configure(compile_context, a, &_tmp_a, lhs_info, reinterpret_input_as_3d);
 
     // Configure transpose kernel
     ICLTensor *reshaped_rhs = &_tmp_b;
     if(_weights_manager && _weights_manager->are_weights_managed(b))
     {
-        _reshape_rhs_kernel_managed.configure(b, rhs_info);
+        _reshape_rhs_kernel_managed.configure(compile_context, b, rhs_info);
         reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed));
     }
     else
     {
-        _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+        _reshape_rhs_kernel.configure(compile_context, b, &_tmp_b, rhs_info);
     }
 
     // Configure and tune matrix multiply kernel
-    _mm_kernel.configure(&_tmp_a, reshaped_rhs, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
+    _mm_kernel.configure(compile_context, &_tmp_a, reshaped_rhs, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
 
     CLScheduler::get().tune_kernel_static(_mm_kernel);
 
@@ -176,7 +178,8 @@ void CLGEMM::configure_reshaped_v1(const ICLTensor *a, const ICLTensor *b, const
     }
 }
 
-void CLGEMM::configure_reshaped(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+void CLGEMM::configure_reshaped_v2(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta,
+                                   const GEMMInfo &gemm_info)
 {
     DataType           data_type               = a->info()->data_type();
     bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
@@ -223,21 +226,21 @@ void CLGEMM::configure_reshaped(const ICLTensor *a, const ICLTensor *b, const IC
     // Configure lhs_info and rhs_info
     std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
 
-    _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
+    _reshape_lhs_kernel.configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
 
     ICLTensor *reshaped_rhs = &_tmp_b;
     if(_weights_manager && _weights_manager->are_weights_managed(b))
     {
-        _reshape_rhs_kernel_managed.configure(b, rhs_info);
+        _reshape_rhs_kernel_managed.configure(compile_context, b, rhs_info);
         reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed));
     }
     else
     {
-        _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+        _reshape_rhs_kernel.configure(compile_context, b, &_tmp_b, rhs_info);
     }
 
     // Configure and tune matrix multiply kernel
-    _mm_reshaped_kernel.configure(&_tmp_a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
+    _mm_reshaped_kernel.configure(compile_context, &_tmp_a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
 
     // Allocate intermediate tensors
     _tmp_a.allocator()->allocate();
@@ -248,7 +251,8 @@ void CLGEMM::configure_reshaped(const ICLTensor *a, const ICLTensor *b, const IC
     }
 }
 
-void CLGEMM::configure_reshaped_only_rhs(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+void CLGEMM::configure_reshaped_only_rhs(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta,
+                                         const GEMMInfo &gemm_info)
 {
     DataType           data_type               = a->info()->data_type();
     bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
@@ -293,16 +297,16 @@ void CLGEMM::configure_reshaped_only_rhs(const ICLTensor *a, const ICLTensor *b,
     ICLTensor *reshaped_rhs = &_tmp_b;
     if(_weights_manager && _weights_manager->are_weights_managed(b))
     {
-        _reshape_rhs_kernel_managed.configure(b, rhs_info);
+        _reshape_rhs_kernel_managed.configure(compile_context, b, rhs_info);
         reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed));
     }
     else
     {
-        _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+        _reshape_rhs_kernel.configure(compile_context, b, &_tmp_b, rhs_info);
     }
 
     // Configure and tune matrix multiply kernel
-    _mm_reshaped_only_rhs_kernel.configure(a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
+    _mm_reshaped_only_rhs_kernel.configure(compile_context, a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
 
     if(!_reshape_b_only_on_first_run && use_mm_b)
     {
@@ -483,6 +487,11 @@ Status CLGEMM::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInf
 }
 
 void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, alpha, beta, gemm_info);
+}
+
+void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
 
@@ -511,22 +520,22 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
     {
         case CLGEMMKernelType::NATIVE_V1:
         {
-            configure_native_v1(a, b, c_to_use, output, alpha, beta, gemm_info);
+            configure_native_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
             break;
         }
         case CLGEMMKernelType::RESHAPED_V1:
        {
-            configure_reshaped_v1(a, b, c_to_use, output, alpha, beta, gemm_info);
+            configure_reshaped_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
            break;
        }
        case CLGEMMKernelType::RESHAPED:
        {
-            configure_reshaped(a, b, c_to_use, output, alpha, beta, gemm_info);
+            configure_reshaped_v2(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
            break;
        }
        case CLGEMMKernelType::RESHAPED_ONLY_RHS:
        {
-            configure_reshaped_only_rhs(a, b, c_to_use, output, alpha, beta, gemm_info);
+            configure_reshaped_only_rhs(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
            break;
        }
        default:
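CLGEMM::configure() now dispatches to one of four kernel variants and hands the same compile context to whichever configure_* helper is selected; the variant itself is still chosen internally from the GPU target and matrix shapes. A hedged caller-side sketch (shapes and helper name are illustrative):

    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLGEMM.h"

    using namespace arm_compute;

    void run_gemm_example()
    {
        CLScheduler::get().default_init();

        CLTensor a, b, dst;
        a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::F32));   // K x M
        b.allocator()->init(TensorInfo(TensorShape(128U, 64U), 1, DataType::F32));  // N x K
        dst.allocator()->init(TensorInfo(TensorShape(128U, 32U), 1, DataType::F32));

        CLGEMM gemm;
        // NATIVE_V1 / RESHAPED_V1 / RESHAPED / RESHAPED_ONLY_RHS is picked internally.
        gemm.configure(CLKernelLibrary::get().get_compile_context(), &a, &b, nullptr, &dst, 1.0f, 0.0f);

        a.allocator()->allocate();
        b.allocator()->allocate();
        dst.allocator()->allocate();
        gemm.run();
    }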
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 5398050533..1c37993bda 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -47,6 +47,11 @@ CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights()
 }
 
 void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), weights, biases, output, num_groups);
+}
+
+void CLConvolutionLayerReshapeWeights::configure(const CLCompileContext &compile_context, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups)
 {
     // Perform validation step
     ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output);
@@ -58,7 +63,7 @@ void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const
     const bool       append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
     const ICLTensor *biases_to_use = (append_biases) ? biases : nullptr;
 
-    _weights_reshape_kernel.configure(weights, biases_to_use, output, num_groups);
+    _weights_reshape_kernel.configure(compile_context, weights, biases_to_use, output, num_groups);
 
     output->info()->set_quantization_info(weights->info()->quantization_info());
 }
@@ -100,7 +105,8 @@ CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> m
 {
 }
 
-void CLGEMMConvolutionLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+void CLGEMMConvolutionLayer::configure_mm(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+                                          const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
                                           int gemm_3d_depth, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
@@ -127,7 +133,7 @@ void CLGEMMConvolutionLayer::configure_mm(const ICLTensor *input, const ICLTenso
         input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
         weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
 
-        _mm_gemmlowp.configure(input, weights, biases, output, gemm_info);
+        _mm_gemmlowp.configure(compile_context, input, weights, biases, output, gemm_info);
 
         // Revert back QuantizatioInfo as input and weights could be used in other convolution layers
         input->info()->set_quantization_info(input_quantization_info);
@@ -136,7 +142,7 @@ void CLGEMMConvolutionLayer::configure_mm(const ICLTensor *input, const ICLTenso
     else
     {
         // Configure matrix multiply function
-        _mm_gemm.configure(input, weights, biases, output, 1.0f, 1.0f, gemm_info);
+        _mm_gemm.configure(compile_context, input, weights, biases, output, 1.0f, 1.0f, gemm_info);
     }
 }
 
@@ -180,6 +186,13 @@ Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITens
 void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
                                        const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups);
+}
+
+void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+                                       const PadStrideInfo &conv_info,
+                                       const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
@@ -252,24 +265,24 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
 
         if(_weights_manager && _weights_manager->are_weights_managed(weights))
         {
-            _reshape_weights_managed.configure(weights, biases, num_groups);
+            _reshape_weights_managed.configure(compile_context, weights, biases, num_groups);
             weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_reshape_weights_managed));
        }
        else
        {
-            _reshape_weights.configure(weights, biases, &_weights_reshaped, num_groups);
+            _reshape_weights.configure(compile_context, weights, biases, &_weights_reshaped, num_groups);
        }
    }
    else
    {
        if(_weights_manager && _weights_manager->are_weights_managed(weights))
        {
-            _reshape_weights_managed.configure(weights, nullptr, num_groups);
+            _reshape_weights_managed.configure(compile_context, weights, nullptr, num_groups);
            weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(weights, &_reshape_weights_managed));
        }
        else
        {
-            _reshape_weights.configure(weights, nullptr, &_weights_reshaped, num_groups);
+            _reshape_weights.configure(compile_context, weights, nullptr, &_weights_reshaped, num_groups);
        }
    }
 
@@ -279,7 +292,7 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
         _memory_group.manage(&_im2col_output);
 
         // Configure and tune im2col. im2col output shape is auto-initialized
-        _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups);
+        _im2col_kernel.configure(compile_context, input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups);
 
         // Set quantization info
         _im2col_output.info()->set_quantization_info(input->info()->quantization_info());
@@ -367,7 +380,7 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
     // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
     const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
 
-    configure_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, act_info);
+    configure_mm(compile_context, gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, act_info);
 
     if(!_skip_im2col)
     {
@@ -377,7 +390,7 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
     if(!_skip_col2im)
     {
         // Configure and tune Col2Im
-        _col2im_kernel.configure(gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups);
+        _col2im_kernel.configure(compile_context, gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups);
         CLScheduler::get().tune_kernel_static(_col2im_kernel);
     }
 
@@ -391,7 +404,7 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
 
     if(!_fuse_activation)
     {
-        _activationlayer_function.configure(output, nullptr, act_info);
+        _activationlayer_function.configure(compile_context, output, nullptr, act_info);
     }
 
     ARM_COMPUTE_UNUSED(weights_info);
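Taken together, the CLGEMMConvolutionLayer hunks above thread the context through the whole im2col -> GEMM -> col2im chain. Heavily condensed (member names as in the file; weights-manager and skip paths elided), the configure sequence reduces to:

    // Condensed shape of CLGEMMConvolutionLayer::configure() after this patch:
    _reshape_weights.configure(compile_context, weights, biases, &_weights_reshaped, num_groups);
    _im2col_kernel.configure(compile_context, input, &_im2col_output,
                             Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups);
    configure_mm(compile_context, gemm_input_to_use, weights_to_use, biases_to_use,
                 gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, act_info);
    _col2im_kernel.configure(compile_context, gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups);
    if(!_fuse_activation)
    {
        _activationlayer_function.configure(compile_context, output, nullptr, act_info);
    }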
diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
index 3298858215..1dcb341fe7 100644
--- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
@@ -28,7 +28,6 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "utils/TypePrinter.h"
 
 #include <memory>
 #include <tuple>
@@ -64,29 +63,29 @@ std::pair<Coordinates, Coordinates> compute_start_end_slice_coordinates(const IT
 }
 Status construct_gemmlowp_output_stage(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, GEMMLowpOutputStageInfo &output_stage_info)
 {
-    const auto data_type = input->data_type();
+    const auto data_type = input->data_type();
 
-    if(is_data_type_quantized_asymmetric(data_type))
-    {
-        const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
-        const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
-
-        float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
-        int   output_multiplier(0);
-        int   output_shift(0);
-        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
-
-        output_stage_info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
-        output_stage_info.gemmlowp_multiplier = output_multiplier;
-        output_stage_info.gemmlowp_shift      = output_shift;
-        output_stage_info.gemmlowp_offset     = oq_info.offset;
-        const auto min_max_bound              = get_min_max(data_type);
-        output_stage_info.gemmlowp_min_bound  = (std::get<0>(min_max_bound)).get<int32_t>();
-        output_stage_info.gemmlowp_max_bound  = (std::get<1>(min_max_bound)).get<int32_t>();
-        output_stage_info.output_data_type    = data_type;
-    }
-    return Status{};
+    if(is_data_type_quantized_asymmetric(data_type))
+    {
+        const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
+        const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
+
+        float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
+        int   output_multiplier(0);
+        int   output_shift(0);
+        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+
+        output_stage_info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+        output_stage_info.gemmlowp_multiplier = output_multiplier;
+        output_stage_info.gemmlowp_shift      = output_shift;
+        output_stage_info.gemmlowp_offset     = oq_info.offset;
+        const auto min_max_bound              = get_min_max(data_type);
+        output_stage_info.gemmlowp_min_bound  = (std::get<0>(min_max_bound)).get<int32_t>();
+        output_stage_info.gemmlowp_max_bound  = (std::get<1>(min_max_bound)).get<int32_t>();
+        output_stage_info.output_data_type    = data_type;
+    }
+    return Status{};
 }
 } // namespace
 
@@ -175,7 +174,6 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, &gemm_output_info.set_data_type(DataType::S32), gemm_info));
         ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(input, weights, output, output_stage_info));
-
     }
     else
     {
@@ -214,6 +212,12 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso
 }
 
 void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info);
+}
+
+void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
+                                         const PadStrideInfo &deconv_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate(input->info(),
@@ -237,9 +241,9 @@ void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor
     if(_is_nchw)
     {
         _memory_group.manage(&_permuted_input);
-        _permute_input_to_nhwc.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
+        _permute_input_to_nhwc.configure(compile_context, input, &_permuted_input, PermutationVector(2U, 0U, 1U));
 
-        _permute_weights_to_nhwc.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
+        _permute_weights_to_nhwc.configure(compile_context, weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
 
         input_to_use   = &_permuted_input;
         weights_to_use = &_permuted_weights;
@@ -251,8 +255,8 @@ void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor
                                                            1,
                                                            input->info()->data_type(), weights->info()->quantization_info()));
 
-    _reshape_weights.configure(weights_to_use, &_reshaped_weights);
-    _transpose_weights.configure(&_reshaped_weights, &_reshaped_weights_t);
+    _reshape_weights.configure(compile_context, weights_to_use, &_reshaped_weights);
+    _transpose_weights.configure(compile_context, &_reshaped_weights, &_reshaped_weights_t);
 
     const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
     GEMMInfo     gemm_info(false, false, true, input->info()->dimension(idx_h), true);
@@ -268,14 +272,14 @@ void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor
         input_to_use->info()->set_quantization_info(QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset));
         _reshaped_weights_t.info()->set_quantization_info(QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset));
 
-        _mm_gemmlowp.configure(input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, gemm_info);
+        _mm_gemmlowp.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, gemm_info);
 
         input_to_use->info()->set_quantization_info(iq_info);
         _reshaped_weights_t.info()->set_quantization_info(wq_info);
     }
     else
     {
-        _mm_gemm.configure(input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, gemm_info);
+        _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, gemm_info);
     }
 
     if(_is_nchw)
@@ -313,14 +317,14 @@ void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor
     }
 
     // Configure a Col2Im call to reshape the output of GEMM
-    _deconv_reshape.configure(&_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), deconv_info);
+    _deconv_reshape.configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), deconv_info);
     _gemm_output.allocator()->allocate();
 
     if(_is_quantized)
     {
         GEMMLowpOutputStageInfo output_stage_info;
         construct_gemmlowp_output_stage(input->info(), weights->info(), output->info(), output_stage_info);
-        _gemmlowp_output_stage.configure(&_gemmlowp_final, nullptr, output_stage_output, output_stage_info);
+        _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output, output_stage_info);
         _gemmlowp_final.allocator()->allocate();
     }
 
@@ -328,7 +332,7 @@ void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor
     if(_padded_input)
     {
         const auto start_end = compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw);
-        _slice_gemm.configure(&_slice_gemm_input, slice_output, start_end.first, start_end.second);
+        _slice_gemm.configure(compile_context, &_slice_gemm_input, slice_output, start_end.first, start_end.second);
         _slice_gemm_input.allocator()->allocate();
     }
 }
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 3465da95b7..84da4a7e98 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -100,6 +100,11 @@ CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
 void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, gemm_info);
+}
+
+void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
     ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
@@ -144,7 +149,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
         TensorInfo weights_info(*b->info());
         weights_info.set_data_type(DataType::QASYMM8);
         _qasymm8_weights.allocator()->init(weights_info);
-        _weights_to_qasymm8.configure(b, &_qasymm8_weights, ConvertPolicy::WRAP, 0);
+        _weights_to_qasymm8.configure(compile_context, b, &_qasymm8_weights, ConvertPolicy::WRAP, 0);
     }
 
     const ICLTensor *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;
@@ -162,7 +167,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
         std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
 
         // Configure reshape RHS kernel
-        _mtx_b_reshape_kernel.configure(_convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
+        _mtx_b_reshape_kernel.configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
     }
 
     // Using default reduction info
@@ -179,7 +184,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
         }
 
         // Configure Matrix B reduction kernel
-        _mtx_b_reduction_kernel.configure(_convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info);
+        _mtx_b_reduction_kernel.configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info);
     }
 
     // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
@@ -190,7 +195,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
         _memory_group.manage(&_vector_sum_row);
 
         // Configure matrix A reduction kernel
-        _mtx_a_reduction_kernel.configure(a, &_vector_sum_row, reduction_info);
+        _mtx_a_reduction_kernel.configure(compile_context, a, &_vector_sum_row, reduction_info);
     }
 
     GEMMKernelInfo gemm_kernel_info;
@@ -220,7 +225,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
         if(_is_gemm_reshaped && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
         {
             // Configure and tune matrix multiply kernel with fused output stage
-            _mm_reshaped_only_rhs_kernel.configure(_matrix_a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
+            _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
                                                    _b_offset == 0 ? nullptr : &_vector_sum_row, c, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
         }
         else
@@ -231,7 +236,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
 
             if(_is_gemm_reshaped)
             {
-                _mm_reshaped_only_rhs_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, gemm_kernel_info);
+                _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, gemm_kernel_info);
             }
             else
             {
@@ -239,11 +244,11 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
                 std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
 
                 // Configure matrix multiply kernel
-                _mm_native_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+                _mm_native_kernel.configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
 
-                _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
+                _offset_contribution_output_stage_kernel.configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output,
+                                                                   a->info()->dimension(0),
                                                                    _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
-
                 _mm_result_s32.allocator()->allocate();
             }
         }
@@ -264,7 +269,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
         if(_is_gemm_reshaped)
         {
             // Configure and tune matrix multiply kernel
-            _mm_reshaped_only_rhs_kernel.configure(_matrix_a, matrix_b, output, gemm_kernel_info);
+            _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info);
         }
         else
         {
@@ -272,11 +277,12 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
             std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
 
             // Configure matrix multiply kernel
-            _mm_native_kernel.configure(_matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+            _mm_native_kernel.configure(compile_context, _matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
         }
 
         // Configure offset contribution kernel
-        _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset, _b_offset);
+        _offset_contribution_kernel.configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset,
+                                              _b_offset);
     }
 
     // Allocate tensors
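The GEMMLowp output-stage entry points shown next all funnel their scalar parameters into a GEMMLowpOutputStageInfo before configuring a kernel. Building the same struct by hand looks roughly like this (field values are illustrative, not from the patch):

    #include "arm_compute/core/Types.h"

    using namespace arm_compute;

    GEMMLowpOutputStageInfo make_quantize_down_info()
    {
        GEMMLowpOutputStageInfo info{};
        info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
        info.gemmlowp_multiplier = 1073741824; // fixed-point multiplier (illustrative)
        info.gemmlowp_shift      = 1;          // right shift applied after the multiply
        info.gemmlowp_offset     = 10;         // output zero point
        info.gemmlowp_min_bound  = 0;          // clamp bounds for QASYMM8
        info.gemmlowp_max_bound  = 255;
        return info;
    }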
diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
index aff7f54a82..18e002aa3d 100644
--- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
@@ -43,7 +43,23 @@ void CLGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ICLTensor *input,
     info.gemmlowp_max_bound  = max;
 
     auto k = arm_compute::support::cpp14::make_unique();
-    k->configure(input, bias, output, &info);
+    k->configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, &info);
+    _kernel = std::move(k);
+}
+
+void CLGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset,
+                                                        int result_mult_int,
+                                                        int result_shift, int min, int max)
+{
+    GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo();
+    info.gemmlowp_offset     = result_offset;
+    info.gemmlowp_multiplier = result_mult_int;
+    info.gemmlowp_shift      = result_shift;
+    info.gemmlowp_min_bound  = min;
+    info.gemmlowp_max_bound  = max;
+
+    auto k = arm_compute::support::cpp14::make_unique();
+    k->configure(compile_context, input, bias, output, &info);
     _kernel = std::move(k);
 }
 
@@ -59,9 +75,16 @@ Status CLGEMMLowpQuantizeDownInt32ToUint8Scale::validate(const ITensorInfo *inpu
 void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
                                                                     int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min, int max)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
+}
+
+void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
+                                                                    int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
+                                                                    int min, int max)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>();
-    k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
+    k->configure(compile_context, input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
     _kernel = std::move(k);
 }
 
@@ -76,7 +99,16 @@ void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const ICLTens
                                                                    int min, int max)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>();
-    k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
+    k->configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
+    _kernel = std::move(k);
+}
+
+void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
+                                                                   int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
+                                                                   int min, int max)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>();
+    k->configure(compile_context, input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
     _kernel = std::move(k);
 }
 
@@ -97,7 +129,22 @@ void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::configure(const ICLTensor *
     info.gemmlowp_max_bound = max;
 
     auto k = arm_compute::support::cpp14::make_unique();
-    k->configure(input, bias, output, &info);
+    k->configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, &info);
+    _kernel = std::move(k);
+}
+
+void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
+                                                               float multiplier, int offset,
+                                                               int min, int max)
+{
+    GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo();
+    info.gemmlowp_offset          = offset;
+    info.gemmlowp_real_multiplier = multiplier;
+    info.gemmlowp_min_bound       = min;
+    info.gemmlowp_max_bound       = max;
+
+    auto k = arm_compute::support::cpp14::make_unique();
+    k->configure(compile_context, input, bias, output, &info);
     _kernel = std::move(k);
 }
 
@@ -113,9 +160,16 @@ Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::validate(const ITensorInf
 void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
                                                                     int result_fixedpoint_multiplier, int result_shift, int min, int max)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, min, max);
+}
+
+void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
+                                                                    int result_fixedpoint_multiplier, int result_shift,
+                                                                    int min, int max)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>();
-    k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, min, max);
+    k->configure(compile_context, input, bias, output, result_fixedpoint_multiplier, result_shift, min, max);
     _kernel = std::move(k);
 }
 
@@ -126,6 +180,11 @@ Status CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(const ITens
 }
 
 void CLGEMMLowpOutputStage::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info);
+}
+
+void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
@@ -138,14 +197,14 @@ void CLGEMMLowpOutputStage::configure(const ICLTensor *input, const ICLTensor *b
                 case DataType::QASYMM8:
                 {
                     auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>();
-                    k->configure(input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+                    k->configure(compile_context, input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
                     _kernel = std::move(k);
                     break;
                 }
                 case DataType::QASYMM8_SIGNED:
                 {
                     auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>();
-                    k->configure(input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+                    k->configure(compile_context, input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
                     _kernel = std::move(k);
                     break;
                 }
@@ -164,14 +223,14 @@ void CLGEMMLowpOutputStage::configure(const ICLTensor *input, const ICLTensor *b
             case GEMMLowpOutputStageType::QUANTIZE_DOWN:
             {
                 auto k = arm_compute::support::cpp14::make_unique();
-                k->configure(input, bias, output, &info);
+                k->configure(compile_context, input, bias, output, &info);
                 _kernel = std::move(k);
                 break;
            }
            case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT:
            {
                auto k = arm_compute::support::cpp14::make_unique();
-                k->configure(input, bias, output, &info);
+                k->configure(compile_context, input, bias, output, &info);
                _kernel = std::move(k);
                break;
            }
diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp
index 08ec4e02d1..e2b18e0f55 100644
--- a/src/runtime/CL/functions/CLGather.cpp
+++ b/src/runtime/CL/functions/CLGather.cpp
@@ -30,9 +30,14 @@ namespace arm_compute
 {
 void CLGather::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, axis);
+}
+
+void CLGather::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLGatherKernel>();
-    k->configure(input, indices, output, axis);
+    k->configure(compile_context, input, indices, output, axis);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLGaussian3x3.cpp b/src/runtime/CL/functions/CLGaussian3x3.cpp
index c187891b12..47367c4b17 100644
--- a/src/runtime/CL/functions/CLGaussian3x3.cpp
+++ b/src/runtime/CL/functions/CLGaussian3x3.cpp
@@ -32,9 +32,14 @@ using namespace arm_compute;
 void CLGaussian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
+}
+
+void CLGaussian3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLGaussian3x3Kernel>();
-    k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+    k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
 }
diff --git a/src/runtime/CL/functions/CLGaussian5x5.cpp b/src/runtime/CL/functions/CLGaussian5x5.cpp
index ea803e4796..6b82cd0c35 100644
--- a/src/runtime/CL/functions/CLGaussian5x5.cpp
+++ b/src/runtime/CL/functions/CLGaussian5x5.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -41,6 +41,11 @@ CLGaussian5x5::CLGaussian5x5(std::shared_ptr<IMemoryManager> memory_manager)
 }
 
 void CLGaussian5x5::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
+}
+
+void CLGaussian5x5::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
 
@@ -50,9 +55,9 @@ void CLGaussian5x5::configure(ICLTensor *input, ICLTensor *output, BorderMode bo
     _memory_group.manage(&_tmp);
 
     // Configure kernels
-    _kernel_hor.configure(input, &_tmp, border_mode == BorderMode::UNDEFINED);
-    _kernel_vert.configure(&_tmp, output, border_mode == BorderMode::UNDEFINED);
-    _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+    _kernel_hor.configure(compile_context, input, &_tmp, border_mode == BorderMode::UNDEFINED);
+    _kernel_vert.configure(compile_context, &_tmp, output, border_mode == BorderMode::UNDEFINED);
+    _border_handler.configure(compile_context, input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
 
     // Allocate intermediate buffers
     _tmp.allocator()->allocate();
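CLGaussian5x5 also shows the memory-group idiom that recurs throughout these functions: the intermediate tensor is "managed" before the kernels that consume it are configured, and only allocated afterwards, so the memory manager can reuse its backing store across functions. Condensed from the hunk above:

    // Intermediate result of the horizontal pass, shared via the memory group
    _memory_group.manage(&_tmp);   // declare lifetime start (no allocation yet)
    _kernel_hor.configure(compile_context, input, &_tmp, border_mode == BorderMode::UNDEFINED);
    _kernel_vert.configure(compile_context, &_tmp, output, border_mode == BorderMode::UNDEFINED);
    _border_handler.configure(compile_context, input, _kernel_hor.border_size(), border_mode,
                              PixelValue(constant_border_value));
    _tmp.allocator()->allocate();  // lifetime end: safe to hand to the manager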
diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp
index 3cda1be2df..1ac98787ac 100644
--- a/src/runtime/CL/functions/CLGaussianPyramid.cpp
+++ b/src/runtime/CL/functions/CLGaussianPyramid.cpp
@@ -56,6 +56,11 @@ CLGaussianPyramidHalf::CLGaussianPyramidHalf() // NOLINT
 }
 
 void CLGaussianPyramidHalf::configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, border_mode, constant_border_value);
+}
+
+void CLGaussianPyramidHalf::configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON(pyramid == nullptr);
@@ -90,16 +95,16 @@ void CLGaussianPyramidHalf::configure(ICLTensor *input, CLPyramid *pyramid, Bord
         for(size_t i = 0; i < num_levels - 1; ++i)
         {
             /* Configure horizontal kernel */
-            _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
+            _horizontal_reduction[i].configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
 
             /* Configure vertical kernel */
-            _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
+            _vertical_reduction[i].configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
 
             /* Configure border */
-            _horizontal_border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+            _horizontal_border_handler[i].configure(compile_context, _pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
 
             /* Configure border */
-            _vertical_border_handler[i].configure(_tmp.get_pyramid_level(i), _vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16));
+            _vertical_border_handler[i].configure(compile_context, _tmp.get_pyramid_level(i), _vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16));
         }
         _tmp.allocate();
     }
@@ -136,6 +141,11 @@ CLGaussianPyramidOrb::CLGaussianPyramidOrb() // NOLINT
 }
 
 void CLGaussianPyramidOrb::configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, border_mode, constant_border_value);
+}
+
+void CLGaussianPyramidOrb::configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
@@ -162,10 +172,10 @@ void CLGaussianPyramidOrb::configure(ICLTensor *input, CLPyramid *pyramid, Borde
         for(size_t i = 0; i < num_levels - 1; ++i)
         {
             /* Configure gaussian 5x5 */
-            _gauss5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
+            _gauss5x5[i].configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
 
             /* Configure scale image kernel */
-            _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, SamplingPolicy::CENTER);
+            _scale_nearest[i].configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, SamplingPolicy::CENTER);
         }
 
         _tmp.allocate();
diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
index 7d16753320..7f037fc51f 100644
--- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
+++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
@@ -63,6 +63,13 @@ CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr<IMemoryManager> memory_manager)
 void CLGenerateProposalsLayer::configure(const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, ICLTensor *scores_out, ICLTensor *num_valid_proposals,
                                          const GenerateProposalsInfo &info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info);
+}
+
+void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context, const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals,
+                                         ICLTensor *scores_out, ICLTensor *num_valid_proposals, const GenerateProposalsInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
     ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info));
@@ -84,7 +91,7 @@ void CLGenerateProposalsLayer::configure(const ICLTensor *scores, const ICLTenso
     // Compute all the anchors
     _memory_group.manage(&_all_anchors);
-    _compute_anchors_kernel.configure(anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
+    _compute_anchors_kernel.configure(compile_context, anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
 
     const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors);
     _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
@@ -94,13 +101,13 @@ void CLGenerateProposalsLayer::configure(const ICLTensor *scores, const ICLTenso
     if(!_is_nhwc)
     {
         _memory_group.manage(&_deltas_permuted);
-        _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
-        _flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened);
+        _permute_deltas_kernel.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
+        _flatten_deltas_kernel.configure(compile_context, &_deltas_permuted, &_deltas_flattened);
         _deltas_permuted.allocator()->allocate();
     }
     else
     {
-        _flatten_deltas_kernel.configure(deltas, &_deltas_flattened);
+        _flatten_deltas_kernel.configure(compile_context, deltas, &_deltas_flattened);
     }
 
     const TensorShape flatten_shape_scores(1, total_num_anchors);
@@ -111,13 +118,13 @@ void CLGenerateProposalsLayer::configure(const ICLTensor *scores, const ICLTenso
     if(!_is_nhwc)
     {
         _memory_group.manage(&_scores_permuted);
-        _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
-        _flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened);
+        _permute_scores_kernel.configure(compile_context, scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
+        _flatten_scores_kernel.configure(compile_context, &_scores_permuted, &_scores_flattened);
         _scores_permuted.allocator()->allocate();
     }
     else
     {
-        _flatten_scores_kernel.configure(scores, &_scores_flattened);
+        _flatten_scores_kernel.configure(compile_context, scores, &_scores_flattened);
     }
 
     CLTensor *anchors_to_use = &_all_anchors;
@@ -129,18 +136,18 @@ void CLGenerateProposalsLayer::configure(const ICLTensor *scores, const ICLTenso
         _memory_group.manage(&_all_anchors_f32);
         _memory_group.manage(&_deltas_flattened_f32);
         // Dequantize anchors to float
-        _dequantize_anchors.configure(&_all_anchors, &_all_anchors_f32);
+        _dequantize_anchors.configure(compile_context, &_all_anchors, &_all_anchors_f32);
         _all_anchors.allocator()->allocate();
         anchors_to_use = &_all_anchors_f32;
         // Dequantize deltas to float
-        _dequantize_deltas.configure(&_deltas_flattened, &_deltas_flattened_f32);
+        _dequantize_deltas.configure(compile_context, &_deltas_flattened, &_deltas_flattened_f32);
         _deltas_flattened.allocator()->allocate();
         deltas_to_use = &_deltas_flattened_f32;
     }
     // Bounding box transform
     _memory_group.manage(&_all_proposals);
     BoundingBoxTransformInfo bbox_info(info.im_width(), info.im_height(), 1.f);
-    _bounding_box_kernel.configure(anchors_to_use, &_all_proposals, deltas_to_use, bbox_info);
+    _bounding_box_kernel.configure(compile_context, anchors_to_use, &_all_proposals, deltas_to_use, bbox_info);
     deltas_to_use->allocator()->allocate();
     anchors_to_use->allocator()->allocate();
 
@@ -150,7 +157,7 @@ void CLGenerateProposalsLayer::configure(const ICLTensor *scores, const ICLTenso
         _memory_group.manage(&_all_proposals_quantized);
         // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset
         _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
-        _quantize_all_proposals.configure(&_all_proposals, &_all_proposals_quantized);
+        _quantize_all_proposals.configure(compile_context, &_all_proposals, &_all_proposals_quantized);
         _all_proposals.allocator()->allocate();
         _all_proposals_to_use = &_all_proposals_quantized;
     }
@@ -185,7 +192,7 @@ void CLGenerateProposalsLayer::configure(const ICLTensor *scores, const ICLTenso
     _scores_flattened.allocator()->allocate();
 
     // Add the first column that represents the batch id. This will be all zeros, as we don't support multiple images
-    _pad_kernel.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } });
+    _pad_kernel.configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } });
     _proposals_4_roi_values.allocator()->allocate();
 }
diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp
index 09314439a8..0645cfdf22 100644
--- a/src/runtime/CL/functions/CLHOGDescriptor.cpp
+++ b/src/runtime/CL/functions/CLHOGDescriptor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -37,6 +37,11 @@ CLHOGDescriptor::CLHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager)
 }
 
 void CLHOGDescriptor::configure(ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, hog, border_mode, constant_border_value);
+}
+
+void CLHOGDescriptor::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON(nullptr == output);
@@ -76,16 +81,16 @@ void CLHOGDescriptor::configure(ICLTensor *input, ICLTensor *output, const IHOG
     _memory_group.manage(&_phase);
 
     // Initialise gradient kernel
-    _gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
+    _gradient.configure(compile_context, input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
 
     // Manage intermediate buffers
     _memory_group.manage(&_hog_space);
 
     // Initialise orientation binning kernel
-    _orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info());
+    _orient_bin.configure(compile_context, &_mag, &_phase, &_hog_space, hog->info());
 
     // Initialize HOG norm kernel
-    _block_norm.configure(&_hog_space, output, hog->info());
+    _block_norm.configure(compile_context, &_hog_space, output, hog->info());
 
     // Allocate intermediate tensors
     _mag.allocator()->allocate();
diff --git a/src/runtime/CL/functions/CLHOGDetector.cpp b/src/runtime/CL/functions/CLHOGDetector.cpp
index 8eb5e4251f..bf9bae1e8b 100644
--- a/src/runtime/CL/functions/CLHOGDetector.cpp
+++ b/src/runtime/CL/functions/CLHOGDetector.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -37,6 +37,12 @@ CLHOGDetector::CLHOGDetector()
 }
 
 void CLHOGDetector::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, hog, detection_windows, detection_window_stride, threshold, idx_class);
+}
+
+void CLHOGDetector::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride,
+                              float threshold, size_t idx_class)
 {
     _detection_windows = detection_windows;
 
@@ -44,7 +50,7 @@ void CLHOGDetector::configure(const ICLTensor *input, const ICLHOG *hog, ICLDete
     _num_detection_windows = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int));
 
     // Configure HOGDetectorKernel
-    _hog_detector_kernel.configure(input, hog, detection_windows, &_num_detection_windows, detection_window_stride, threshold, idx_class);
+    _hog_detector_kernel.configure(compile_context, input, hog, detection_windows, &_num_detection_windows, detection_window_stride, threshold, idx_class);
 }
 
 void CLHOGDetector::run()
diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp
index e509fd8e36..acf5f2c568 100644
--- a/src/runtime/CL/functions/CLHOGGradient.cpp
+++ b/src/runtime/CL/functions/CLHOGGradient.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
* * SPDX-License-Identifier: MIT * @@ -35,6 +35,12 @@ CLHOGGradient::CLHOGGradient(std::shared_ptr memory_manager) } void CLHOGGradient::configure(ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output_magnitude, output_phase, phase_type, border_mode, constant_border_value); +} + +void CLHOGGradient::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, + uint8_t constant_border_value) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_magnitude, 1, DataType::S16); @@ -52,16 +58,16 @@ void CLHOGGradient::configure(ICLTensor *input, ICLTensor *output_magnitude, ICL _memory_group.manage(&_gy); // Initialise derivate kernel - _derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value); + _derivative.configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value); // Initialise magnitude/phase kernel if(PhaseType::UNSIGNED == phase_type) { - _mag_phase.configure(&_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::UNSIGNED); + _mag_phase.configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::UNSIGNED); } else { - _mag_phase.configure(&_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::SIGNED); + _mag_phase.configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::SIGNED); } // Allocate intermediate tensors diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp index 54ad1b35bf..248f7307e6 100644 --- a/src/runtime/CL/functions/CLHOGMultiDetection.cpp +++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp @@ -54,6 +54,14 @@ CLHOGMultiDetection::CLHOGMultiDetection(std::shared_ptr memory_ void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides, BorderMode border_mode, uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, multi_hog, detection_windows, detection_window_strides, border_mode, constant_border_value, threshold, non_maxima_suppression, + min_distance); +} + +void CLHOGMultiDetection::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, + ICLSize2DArray *detection_window_strides, BorderMode border_mode, + uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(multi_hog); @@ -145,7 +153,7 @@ void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_h _memory_group.manage(&_phase); // Initialise gradient kernel - _gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value); + _gradient_kernel.configure(compile_context, input, &_mag, &_phase, phase_type, border_mode, constant_border_value); // Configure NETensor for the HOG space and orientation binning kernel for(size_t i = 0; i < 
_num_orient_bin_kernel; ++i)
@@ -173,7 +181,7 @@ void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_h
         _memory_group.manage(&_hog_space[i]);
         // Initialise orientation binning kernel
-        _orient_bin_kernel[i].configure(&_mag, &_phase, &_hog_space[i], multi_hog->model(idx_multi_hog)->info());
+        _orient_bin_kernel[i].configure(compile_context, &_mag, &_phase, &_hog_space[i], multi_hog->model(idx_multi_hog)->info());
     }
     // Allocate intermediate tensors
@@ -194,7 +202,7 @@ void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_h
         _memory_group.manage(&_hog_norm_space[i]);
         // Initialize block normalization kernel
-        _block_norm_kernel[i].configure(&_hog_space[idx_orient_bin], &_hog_norm_space[i], multi_hog->model(idx_multi_hog)->info());
+        _block_norm_kernel[i].configure(compile_context, &_hog_space[idx_orient_bin], &_hog_norm_space[i], multi_hog->model(idx_multi_hog)->info());
     }
     // Allocate intermediate tensors
@@ -210,7 +218,7 @@ void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_h
     {
         const size_t idx_block_norm = input_hog_detect[i];
-        _hog_detect_kernel[i].configure(&_hog_norm_space[idx_block_norm], multi_hog->cl_model(i), detection_windows, detection_window_strides->at(i), threshold, i);
+        _hog_detect_kernel[i].configure(compile_context, &_hog_norm_space[idx_block_norm], multi_hog->cl_model(i), detection_windows, detection_window_strides->at(i), threshold, i);
     }
     detection_window_strides->unmap(CLScheduler::get().queue());
diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp
index 3cae95f551..aecec0d3c5 100644
--- a/src/runtime/CL/functions/CLHarrisCorners.cpp
+++ b/src/runtime/CL/functions/CLHarrisCorners.cpp
@@ -64,6 +64,13 @@ CLHarrisCorners::CLHarrisCorners(std::shared_ptr<IMemoryManager> memory_manager)
 void CLHarrisCorners::configure(ICLImage *input, float threshold, float min_dist,
                                 float sensitivity, int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners,
                                 BorderMode border_mode, uint8_t constant_border_value, bool use_fp16)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, threshold, min_dist, sensitivity, gradient_size, block_size, corners, border_mode, constant_border_value, use_fp16);
+}
+
+void CLHarrisCorners::configure(const CLCompileContext &compile_context, ICLImage *input, float threshold, float min_dist,
+                                float sensitivity, int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners,
+                                BorderMode border_mode, uint8_t constant_border_value, bool use_fp16)
 {
     ARM_COMPUTE_UNUSED(use_fp16); //TODO(COMPMID-772): Add half float support
     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
@@ -96,21 +103,21 @@ void CLHarrisCorners::configure(ICLImage *input, float threshold, float min_dist
         case 3:
         {
            auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3>();
-           k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+           k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
            _sobel = std::move(k);
            break;
        }
        case 5:
        {
            auto k = arm_compute::support::cpp14::make_unique<CLSobel5x5>();
-           k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+           k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
            _sobel = std::move(k);
            break;
        }
        case 7:
        {
            auto k = arm_compute::support::cpp14::make_unique<CLSobel7x7>();
-           k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+           k->configure(compile_context, input, &_gx, &_gy, border_mode, constant_border_value);
            _sobel = std::move(k);
            break;
        }
@@ -126,11 +133,11 @@ void CLHarrisCorners::configure(ICLImage *input, float threshold, float min_dist
     _memory_group.manage(&_score);
     // Set/init Harris Score kernel accordingly with block_size
-    _harris_score.configure(&_gx, &_gy, &_score, block_size, pow4_normalization_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+    _harris_score.configure(compile_context, &_gx, &_gy, &_score, block_size, pow4_normalization_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
     // Configure border filling using harris score kernel's block size
-    _border_gx.configure(&_gx, _harris_score.border_size(), border_mode, PixelValue(constant_border_value));
-    _border_gy.configure(&_gy, _harris_score.border_size(), border_mode, PixelValue(constant_border_value));
+    _border_gx.configure(compile_context, &_gx, _harris_score.border_size(), border_mode, PixelValue(constant_border_value));
+    _border_gy.configure(compile_context, &_gy, _harris_score.border_size(), border_mode, PixelValue(constant_border_value));
     // Allocate intermediate buffers
     _gx.allocator()->allocate();
@@ -140,7 +147,7 @@ void CLHarrisCorners::configure(ICLImage *input, float threshold, float min_dist
     _memory_group.manage(&_nonmax);
     // Init non-maxima suppression function
-    _non_max_suppr.configure(&_score, &_nonmax, border_mode);
+    _non_max_suppr.configure(compile_context, &_score, &_nonmax, border_mode);
     // Allocate intermediate buffers
     _score.allocator()->allocate();
diff --git a/src/runtime/CL/functions/CLHistogram.cpp b/src/runtime/CL/functions/CLHistogram.cpp
index eb543387f5..e723024334 100644
--- a/src/runtime/CL/functions/CLHistogram.cpp
+++ b/src/runtime/CL/functions/CLHistogram.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -34,8 +34,13 @@ CLHistogram::CLHistogram()
 void CLHistogram::configure(const ICLImage *input, ICLDistribution1D *output)
 {
-    _kernel.configure(input, output);
-    _kernel_border.configure(input, output);
+    configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLHistogram::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output)
+{
+    _kernel.configure(compile_context, input, output);
+    _kernel_border.configure(compile_context, input, output);
 }
 void CLHistogram::run()
diff --git a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
index e639e74394..273a873c81 100644
--- a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
@@ -33,9 +33,14 @@ CLInstanceNormalizationLayer::CLInstanceNormalizationLayer()
 }
 void CLInstanceNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, gamma, beta, epsilon, use_mixed_precision);
+}
+
+void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLInstanceNormalizationLayerKernel>();
-    k->configure(input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision));
+    k->configure(compile_context, input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision));
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLIntegralImage.cpp b/src/runtime/CL/functions/CLIntegralImage.cpp
index 2d54be32fa..b3be2f8c2c 100644
--- a/src/runtime/CL/functions/CLIntegralImage.cpp
+++ b/src/runtime/CL/functions/CLIntegralImage.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -35,8 +35,13 @@ CLIntegralImage::CLIntegralImage()
 void CLIntegralImage::configure(const ICLTensor *input, ICLTensor *output)
 {
-    _integral_hor.configure(input, output);
-    _integral_vert.configure(output);
+    configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLIntegralImage::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
+    _integral_hor.configure(compile_context, input, output);
+    _integral_vert.configure(compile_context, output);
 }
 void CLIntegralImage::run()
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index 95a62c5317..14c83cd543 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -44,6 +44,11 @@ CLL2NormalizeLayer::CLL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager)
 }
 void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, int axis, float epsilon)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, epsilon);
+}
+
+void CLL2NormalizeLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon)
 {
     // Reset auxiliary tensor
     _sumsq.allocator()->init(TensorInfo());
@@ -53,8 +58,8 @@ void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, int axis
     // Configure kernels
     const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim);
-    _reduce_func.configure(input, &_sumsq, actual_axis, ReductionOperation::SUM_SQUARE);
-    _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon);
+    _reduce_func.configure(compile_context, input, &_sumsq, actual_axis, ReductionOperation::SUM_SQUARE);
+    _normalize_kernel.configure(compile_context, input, &_sumsq, output, axis, epsilon);
     // Allocate intermediate tensor
     _sumsq.allocator()->allocate();
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index 3a3917784b..32ff813f43 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -58,6 +58,19 @@ void CLLSTMLayer::configure(const ICLTensor *input,
                             const ICLTensor *output_state_in, const ICLTensor *cell_state_in,
                             ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output,
                             const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
+              recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info,
+              cell_threshold, projection_threshold);
+}
+
+void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input,
+                            const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
+                            const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
+                            const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
+                            const ICLTensor *output_state_in, const ICLTensor *cell_state_in,
+                            ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output,
+                            const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
@@ -97,7 +110,7 @@ void CLLSTMLayer::configure(const ICLTensor *input,
     _forget_gate_out2.allocator()->init(TensorInfo(concat_shape, 1, input->info()->data_type()));
     _memory_group.manage(&_forget_gate_out2);
-    _concat_inputs_forget_gate.configure(input, output_state_in, &_forget_gate_out2);
+    _concat_inputs_forget_gate.configure(compile_context, input, output_state_in, &_forget_gate_out2);
     std::vector<const ICLTensor *> weights_vector;
@@ -106,10 +119,10 @@ void CLLSTMLayer::configure(const ICLTensor *input,
     const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0);
     _forget_gate_out6.allocator()->init(TensorInfo(weights_concat_shape, 1, input->info()->data_type()));
-    _concat_weights_forget_gate.configure(input_to_forget_weights, recurrent_to_forget_weights, &_forget_gate_out6);
+    _concat_weights_forget_gate.configure(compile_context, input_to_forget_weights, recurrent_to_forget_weights, &_forget_gate_out6);
     _memory_group.manage(&_forget_gate_out5);
-    _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
+    _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
     _memory_group.manage(&_forget_gate_out1);
     _memory_group.manage(&_forget_gate_out3);
     _forget_gate_out6.allocator()->allocate();
@@ -121,8 +134,8 @@ void CLLSTMLayer::configure(const ICLTensor *input,
         _run_peephole_opt = true;
         _memory_group.manage(&_forget_gate_out4);
-        _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
-        _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE);
+        _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+        _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE);
         _forget_gate_out4.allocator()->allocate();
         _forget_gate_out5.allocator()->allocate();
         forget_gate_out = &_forget_gate_out3;
@@ -137,15 +150,16 @@ void CLLSTMLayer::configure(const ICLTensor *input,
         _forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _memory_group.manage(&_forget_layer_norm_out1);
         _memory_group.manage(&_forget_layer_norm_out2);
-        _mean_std_norm_forget_gate.configure(forget_gate_out);
-        _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+        _mean_std_norm_forget_gate.configure(compile_context, forget_gate_out);
+        _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE,
+                                                   RoundingPolicy::TO_NEAREST_EVEN);
         // forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
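        // For the layer-normalised path just configured, the forget gate amounts to,
        // element-wise (a sketch of the maths, not library code):
        //
        //     f = sigmoid(layer_norm(W_f * [x, h_prev]) * forget_layer_norm_weights + forget_gate_bias)
        //
        // which is why the bias is deliberately passed as nullptr to the fully
        // connected stage above and only accumulated after the normalisation below.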
forget_gate_out->allocator()->allocate(); - _accum_forget_gate_bias.configure(ArithmeticOperation::ADD, &_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_forget_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE); _forget_layer_norm_out1.allocator()->allocate(); forget_gate_out = &_forget_layer_norm_out2; } - _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the input gate // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG @@ -158,8 +172,8 @@ void CLLSTMLayer::configure(const ICLTensor *input, { _memory_group.manage(&_input_gate_out1); _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); - _ones_memset_kernel.configure(&_ones, PixelValue(1, _ones.info()->data_type())); - _subtract_input_gate.configure(ArithmeticOperation::SUB, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE); + _ones_memset_kernel.configure(compile_context, &_ones, PixelValue(1, _ones.info()->data_type())); + _subtract_input_gate.configure(compile_context, ArithmeticOperation::SUB, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE); _ones.allocator()->allocate(); _run_cifg_opt = true; } @@ -174,20 +188,20 @@ void CLLSTMLayer::configure(const ICLTensor *input, TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); _input_gate_out2.allocator()->init(TensorInfo(lstm_weights_concat_shape, 1, input->info()->data_type())); - _concat_weights_input_gate.configure(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &_input_gate_out2); + _concat_weights_input_gate.configure(compile_context, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &_input_gate_out2); _memory_group.manage(&_input_gate_out1); _memory_group.manage(&_input_gate_out3); - _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3); + _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? 
nullptr : lstm_params.input_gate_bias(), &_input_gate_out3); _input_gate_out2.allocator()->allocate(); input_gate_out = &_input_gate_out3; if(_run_peephole_opt) { _memory_group.manage(&_input_gate_out4); - _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE); + _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE); _input_gate_out3.allocator()->allocate(); _input_gate_out4.allocator()->allocate(); input_gate_out = &_input_gate_out1; @@ -203,15 +217,16 @@ void CLLSTMLayer::configure(const ICLTensor *input, _input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_input_layer_norm_out1); _memory_group.manage(&_input_layer_norm_out2); - _mean_std_norm_input_gate.configure(input_gate_out); - _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _mean_std_norm_input_gate.configure(compile_context, input_gate_out); + _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); // input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before input_gate_out->allocator()->allocate(); - _accum_input_gate_bias.configure(ArithmeticOperation::ADD, &_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_input_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE); _input_layer_norm_out1.allocator()->allocate(); input_gate_out = &_input_layer_norm_out2; } - _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_input_gate.configure(compile_context, input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); } // Configure block that calculates the cell state @@ -224,14 +239,14 @@ void CLLSTMLayer::configure(const ICLTensor *input, _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_state_out1); - _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1); + _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights, (_is_layer_norm_lstm) ? 
nullptr : cell_bias, &_cell_state_out1); _memory_group.manage(&_cell_state_out2); - _transpose_cell_state.configure(recurrent_to_cell_weights, &_cell_state_out2); + _transpose_cell_state.configure(compile_context, recurrent_to_cell_weights, &_cell_state_out2); _memory_group.manage(&_cell_state_out3); - _gemm_cell_state1.configure(output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f); + _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f); _cell_state_out2.allocator()->allocate(); _memory_group.manage(&_cell_state_out4); - _accum_cell_state1.configure(ArithmeticOperation::ADD, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE); + _accum_cell_state1.configure(compile_context, ArithmeticOperation::ADD, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE); CLTensor *cell_state_out_ptr = &_cell_state_out4; if(_is_layer_norm_lstm) { @@ -239,27 +254,28 @@ void CLLSTMLayer::configure(const ICLTensor *input, _cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_layer_norm_out1); _memory_group.manage(&_cell_layer_norm_out2); - _mean_std_norm_cell_gate.configure(cell_state_out_ptr); - _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _mean_std_norm_cell_gate.configure(compile_context, cell_state_out_ptr); + _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); // cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before cell_state_out_ptr->allocator()->allocate(); - _accum_cell_gate_bias.configure(ArithmeticOperation::ADD, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_cell_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE); _cell_layer_norm_out1.allocator()->allocate(); cell_state_out_ptr = &_cell_layer_norm_out2; } - _activation_cell_state.configure(cell_state_out_ptr, nullptr, activation_info); + _activation_cell_state.configure(compile_context, cell_state_out_ptr, nullptr, activation_info); _memory_group.manage(&_cell_state_out5); - _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); cell_state_out_ptr->allocator()->allocate(); - _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_cell_state2.configure(ArithmeticOperation::ADD, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE); + _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _accum_cell_state2.configure(compile_context, ArithmeticOperation::ADD, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE); 
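    // Taken together, the kernels configured above implement the LSTM cell-state
    // recurrence; element-wise (sketch, not library code):
    //
    //     c_t = f_t * c_{t-1} + i_t * act(W_c x_t + R_c h_{t-1} + b_c)
    //
    // _pixelwise_mul_cell_state2 produces f_t * c_{t-1}, _pixelwise_mul_cell_state1
    // produces the input-gated candidate, and _accum_cell_state2 sums the two into
    // _cell_state_out1, which is then optionally clipped to [-cell_threshold, cell_threshold].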
_cell_state_out3.allocator()->allocate(); _cell_state_out5.allocator()->allocate(); // Perform clipping if(cell_threshold != 0.f) { _perform_cell_clipping = true; - _cell_clip.configure(&_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold)); + _cell_clip.configure(compile_context, &_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold)); } // Configure block that calculates the output @@ -274,12 +290,12 @@ void CLLSTMLayer::configure(const ICLTensor *input, TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); _output2.allocator()->init(TensorInfo(in_out_weights_concat_shape, 1, input->info()->data_type())); - _concat_weights_output.configure(input_to_output_weights, recurrent_to_output_weights, &_output2); + _concat_weights_output.configure(compile_context, input_to_output_weights, recurrent_to_output_weights, &_output2); _memory_group.manage(&_output1); _memory_group.manage(&_output4); - _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4); + _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4); _output2.allocator()->allocate(); _forget_gate_out2.allocator()->allocate(); @@ -290,8 +306,8 @@ void CLLSTMLayer::configure(const ICLTensor *input, _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type())); _memory_group.manage(&_output3); - _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_output1.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE); + _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _accum_output1.configure(compile_context, &_output4, &_output3, &_output1, ConvertPolicy::SATURATE); _output4.allocator()->allocate(); output_gate_out = &_output1; @@ -308,15 +324,16 @@ void CLLSTMLayer::configure(const ICLTensor *input, _output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_output_layer_norm_out1); _memory_group.manage(&_output_layer_norm_out2); - _mean_std_norm_output_gate.configure(output_gate_out); - _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _mean_std_norm_output_gate.configure(compile_context, output_gate_out); + _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_NEAREST_EVEN); // output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before output_gate_out->allocator()->allocate(); - _accum_output_gate_bias.configure(ArithmeticOperation::ADD, &_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_output_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_output_layer_norm_out1, output_gate_bias, 
&_output_layer_norm_out2, ConvertPolicy::SATURATE); _output_layer_norm_out1.allocator()->allocate(); output_gate_out = &_output_layer_norm_out2; } - _activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_output.configure(compile_context, output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the output state /** lstm_res = PixelwiseMul(output, Activation(cell_state)) @@ -332,26 +349,26 @@ void CLLSTMLayer::configure(const ICLTensor *input, _output_state1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_state_activation); - _activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, activation_info); - _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _activation_output_state.configure(compile_context, &_cell_state_out1, &_cell_state_activation, activation_info); + _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); _cell_state_activation.allocator()->allocate(); if(lstm_params.has_projection()) { _has_projection_weights = true; - _fully_connected_output_state.configure(output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out); + _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out); _output_state1.allocator()->allocate(); // Perform clipping if(projection_threshold != 0.f) { _perform_projection_clipping = true; - _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)); + _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)); } } // Copy cell state and output - _copy_cell_state.configure(&_cell_state_out1, cell_state_out); - _copy_output.configure(output_state_out, output); + _copy_cell_state.configure(compile_context, &_cell_state_out1, cell_state_out); + _copy_output.configure(compile_context, output_state_out, output); // Vector for holding the tensors to store in scratch buffer std::vector scratch_inputs; @@ -362,7 +379,7 @@ void CLLSTMLayer::configure(const ICLTensor *input, scratch_inputs.emplace_back(&_cell_state_out1); scratch_inputs.emplace_back(forget_gate_out); scratch_inputs.emplace_back(output_gate_out); - _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer, Window::DimX); + _concat_scratch_buffer.configure(compile_context, scratch_inputs, scratch_buffer, Window::DimX); input_gate_out->allocator()->allocate(); _cell_state_out1.allocator()->allocate(); forget_gate_out->allocator()->allocate(); diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp index e5f127825b..c57fcc9f21 100644 --- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp +++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -61,6 +61,18 @@ void CLLSTMLayerQuantized::configure(const ICLTensor *input, const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, ICLTensor *cell_state_in, const ICLTensor *output_state_in, ICLTensor *cell_state_out, ICLTensor *output_state_out) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + output_state_out); +} + +void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, const ICLTensor *input, + const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, + const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, + const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, + ICLTensor *cell_state_in, const ICLTensor *output_state_in, + ICLTensor *cell_state_out, ICLTensor *output_state_out) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, @@ -107,18 +119,18 @@ void CLLSTMLayerQuantized::configure(const ICLTensor *input, recurrent_weights_vector.emplace_back(recurrent_to_output_weights); _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); - _concat_input_weights.configure(inputs_weights_vector, &_input_weights, Window::DimY); + _concat_input_weights.configure(compile_context, inputs_weights_vector, &_input_weights, Window::DimY); _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); - _concat_recurrent_weights.configure(recurrent_weights_vector, &_recurrent_weights, Window::DimY); + _concat_recurrent_weights.configure(compile_context, recurrent_weights_vector, &_recurrent_weights, Window::DimY); std::vector weights_vector; weights_vector.emplace_back(&_recurrent_weights); weights_vector.emplace_back(&_input_weights); _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); - _concat_weights.configure(weights_vector, &_weights, Window::DimX); - _transpose_weights.configure(&_weights, &_weights_transposed); + _concat_weights.configure(compile_context, weights_vector, &_weights, Window::DimX); + _transpose_weights.configure(compile_context, &_weights, &_weights_transposed); // Input concatenation std::vector input_vector; @@ -127,7 +139,7 @@ void CLLSTMLayerQuantized::configure(const ICLTensor *input, _memory_group.manage(&_input); _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); - _concat_inputs.configure(input_vector, &_input, Window::DimX); + _concat_inputs.configure(compile_context, input_vector, &_input, Window::DimX); // Bias concatenation 
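    // The concatenations configured above exist so that a single quantized GEMM can
    // compute all four gates at once: the weights are stacked into an
    // (input_size + output_size) x (4 * output_size) matrix, and the four gate
    // biases are stacked below into one TensorShape(4 * output_size) vector that
    // lines up with the GEMM output columns. Conceptually (sketch, not library code):
    //
    //     [i | f | g | o] pre-activations = [x, h_prev] * W_all^T   (+ b_all via the output stage)
    //
    // The per-gate slices are then carved back out with the slice functions that follow.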
std::vector bias_vector; @@ -137,7 +149,7 @@ void CLLSTMLayerQuantized::configure(const ICLTensor *input, bias_vector.emplace_back(output_gate_bias); _bias.allocator()->init(TensorInfo(TensorShape(4 * output_size), 1, DataType::S32)); - _concat_bias.configure(bias_vector, &_bias, Window::DimX); + _concat_bias.configure(compile_context, bias_vector, &_bias, Window::DimX); // Invert the offset for gemmlowp _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset)); @@ -146,7 +158,7 @@ void CLLSTMLayerQuantized::configure(const ICLTensor *input, // Run gemmlowp _memory_group.manage(&_output_highp); _output_highp.allocator()->init(TensorInfo(TensorShape(4 * output_size, batch_size), 1, DataType::S32)); - _gemmlowp.configure(&_input, &_weights_transposed, nullptr, &_output_highp); + _gemmlowp.configure(compile_context, &_input, &_weights_transposed, nullptr, &_output_highp); _input.allocator()->allocate(); // Set the offset back @@ -162,7 +174,7 @@ void CLLSTMLayerQuantized::configure(const ICLTensor *input, quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); _memory_group.manage(&_output_lowp); - _output_stage.configure(&_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift); + _output_stage.configure(compile_context, &_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift); _output_highp.allocator()->allocate(); _bias.allocator()->allocate(); @@ -170,86 +182,86 @@ void CLLSTMLayerQuantized::configure(const ICLTensor *input, if(batch_size > 1) { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size }); + _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size }); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }); + _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }); + _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }); + _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }); _output_lowp.allocator()->allocate(); } else { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0 }, { output_size }); + _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0 }, { output_size }); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size }); + _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size }); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size 
}); + _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size }); + _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size }); _output_lowp.allocator()->allocate(); } // Forget gate _memory_group.manage(&_forget_gate_output); _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _forget_gate_input.allocator()->allocate(); // Input gate _memory_group.manage(&_input_gate_output); _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _input_gate_input.allocator()->allocate(); // Input modulation gate equation _memory_group.manage(&_input_modulation_gate_output); _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _input_modulation_gate_input.allocator()->allocate(); // Output gate _memory_group.manage(&_output_gate_output); _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _output_gate_input.allocator()->allocate(); // Long term memory _memory_group.manage(&_cell_state_tmp1); _cell_state_tmp1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul_forget_gate_cell_state.configure(&_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _forget_gate_output.allocator()->allocate(); _memory_group.manage(&_cell_state_tmp2); _cell_state_tmp2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul_input_gate_input_mod_gate.configure(&_input_gate_output, &_input_modulation_gate_output, 
&_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output, &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _input_modulation_gate_output.allocator()->allocate(); _input_gate_output.allocator()->allocate(); - _add_cell_state_tmps.configure(&_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE); + _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE); _cell_state_tmp1.allocator()->allocate(); _cell_state_tmp2.allocator()->allocate(); // Short term memory _memory_group.manage(&_output_state_tmp); _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_output_state.configure(cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _memory_group.manage(&_output_state_out_symm); _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _mul_output_state_tmp_output_gate.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _output_gate_output.allocator()->allocate(); _output_state_tmp.allocator()->allocate(); // Requantize the output state from QSYMM16 to QASYMM8 _memory_group.manage(&_output_state_out_f32); _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); - _dequantize.configure(&_output_state_out_symm, &_output_state_out_f32); + _dequantize.configure(compile_context, &_output_state_out_symm, &_output_state_out_f32); _output_state_out_symm.allocator()->allocate(); - _quantize.configure(&_output_state_out_f32, output_state_out); + _quantize.configure(compile_context, &_output_state_out_f32, output_state_out); _output_state_out_f32.allocator()->allocate(); } diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp index c2c04e6002..831f0cdcdf 100644 --- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp +++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp @@ -47,6 +47,11 @@ CLLaplacianPyramid::CLLaplacianPyramid() // NOLINT } void CLLaplacianPyramid::configure(ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, output, border_mode, constant_border_value); +} + +void CLLaplacianPyramid::configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) { ARM_COMPUTE_ERROR_ON(nullptr == pyramid); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); @@ -67,18 +72,18 @@ void CLLaplacianPyramid::configure(ICLTensor *input, CLPyramid *pyramid, ICLTens _conv_pyr.init(pyramid_info); // Create Gaussian Pyramid function - _gaussian_pyr_function.configure(input, &_gauss_pyr, 
border_mode, constant_border_value); + _gaussian_pyr_function.configure(compile_context, input, &_gauss_pyr, border_mode, constant_border_value); _convf.resize(_num_levels); _subf.resize(_num_levels); for(unsigned int i = 0; i < _num_levels; ++i) { - _convf[i].configure(_gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), border_mode, constant_border_value); - _subf[i].configure(_gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), pyramid->get_pyramid_level(i), ConvertPolicy::WRAP); + _convf[i].configure(compile_context, _gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), border_mode, constant_border_value); + _subf[i].configure(compile_context, _gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), pyramid->get_pyramid_level(i), ConvertPolicy::WRAP); } - _depth_function.configure(_conv_pyr.get_pyramid_level(_num_levels - 1), output, ConvertPolicy::WRAP, 0); + _depth_function.configure(compile_context, _conv_pyr.get_pyramid_level(_num_levels - 1), output, ConvertPolicy::WRAP, 0); _gauss_pyr.allocate(); _conv_pyr.allocate(); diff --git a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp index 8d1cd98c91..ea6a3f9a98 100644 --- a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp +++ b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp @@ -42,6 +42,11 @@ CLLaplacianReconstruct::CLLaplacianReconstruct() // NOLINT } void CLLaplacianReconstruct::configure(const CLPyramid *pyramid, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), pyramid, input, output, border_mode, constant_border_value); +} + +void CLLaplacianReconstruct::configure(const CLCompileContext &compile_context, const CLPyramid *pyramid, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value) { ARM_COMPUTE_ERROR_ON(nullptr == pyramid); ARM_COMPUTE_ERROR_ON(input == output); @@ -67,17 +72,17 @@ void CLLaplacianReconstruct::configure(const CLPyramid *pyramid, ICLTensor *inpu const size_t last_level = num_levels - 1; - _addf[last_level].configure(input, pyramid->get_pyramid_level(last_level), _tmp_pyr.get_pyramid_level(last_level), ConvertPolicy::SATURATE); + _addf[last_level].configure(compile_context, input, pyramid->get_pyramid_level(last_level), _tmp_pyr.get_pyramid_level(last_level), ConvertPolicy::SATURATE); // Scale levels n-1 to 1, and add levels n-2 to 0 for(size_t l = 0; l < last_level; ++l) { - _scalef[l].configure(_tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value); - _addf[l].configure(_tmp_pyr.get_pyramid_level(l), pyramid->get_pyramid_level(l), _tmp_pyr.get_pyramid_level(l), ConvertPolicy::SATURATE); + _scalef[l].configure(compile_context, _tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value); + _addf[l].configure(compile_context, _tmp_pyr.get_pyramid_level(l), pyramid->get_pyramid_level(l), _tmp_pyr.get_pyramid_level(l), ConvertPolicy::SATURATE); } // Convert level 0 from S16 to U8 - _depthf.configure(_tmp_pyr.get_pyramid_level(0), output, ConvertPolicy::SATURATE, 0); + _depthf.configure(compile_context, _tmp_pyr.get_pyramid_level(0), output, ConvertPolicy::SATURATE, 0); _tmp_pyr.allocate(); } diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp 
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index 3e99dde253..950be5030f 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -128,6 +128,12 @@ Status CLLocallyConnectedLayer::validate(const ITensorInfo *input, const ITensor
 }
 
 void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info);
+}
+
+void CLLocallyConnectedLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+                                        const PadStrideInfo &conv_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_ERROR_THROW_ON(CLLocallyConnectedLayer::validate(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info));
@@ -160,10 +166,10 @@ void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor
     _memory_group.manage(&_gemm_output);
 
     // Configure kernels
-    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
-    _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
-    _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
-    _output_col2im_kernel.configure(&_gemm_output, output, Size2D(conv_w, conv_h));
+    _input_im2col_kernel.configure(compile_context, input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
+    _weights_reshape_kernel.configure(compile_context, weights, biases, &_weights_reshaped);
+    _mm_kernel.configure(compile_context, &_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
+    _output_col2im_kernel.configure(compile_context, &_gemm_output, output, Size2D(conv_w, conv_h));
 
     // Allocate intermediate tensors
     _input_im2col_reshaped.allocator()->allocate();
diff --git a/src/runtime/CL/functions/CLMagnitude.cpp b/src/runtime/CL/functions/CLMagnitude.cpp
index 9ca9d167eb..a267952d4a 100644
--- a/src/runtime/CL/functions/CLMagnitude.cpp
+++ b/src/runtime/CL/functions/CLMagnitude.cpp
@@ -31,8 +31,13 @@ using namespace arm_compute;
 
 void CLMagnitude::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, mag_type);
+}
+
+void CLMagnitude::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLMagnitudePhaseKernel>();
-    k->configure(input1, input2, output, nullptr, mag_type);
+    k->configure(compile_context, input1, input2, output, nullptr, mag_type);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLMeanStdDev.cpp b/src/runtime/CL/functions/CLMeanStdDev.cpp
index 8517b59e7a..e3ce704bfb 100644
--- a/src/runtime/CL/functions/CLMeanStdDev.cpp
+++ b/src/runtime/CL/functions/CLMeanStdDev.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -65,6 +65,11 @@ Status CLMeanStdDev::validate(ITensorInfo *input, float *mean, float *stddev)
 }
 
 void CLMeanStdDev::configure(ICLImage *input, float *mean, float *stddev)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, mean, stddev);
+}
+
+void CLMeanStdDev::configure(const CLCompileContext &compile_context, ICLImage *input, float *mean, float *stddev)
 {
     // In the case of F16/F32 we call reduction operation for calculating CLMeanStdDev
     _data_type = input->info()->data_type();
@@ -74,14 +79,14 @@ void CLMeanStdDev::configure(ICLImage *input, float *mean, float *stddev)
         _num_pixels = input->info()->dimension(0) * input->info()->dimension(1);
 
         _memory_group.manage(&_reduction_output_mean);
-        _reduction_operation_mean.configure(input, &_reduction_output_mean, 0, ReductionOperation::SUM);
+        _reduction_operation_mean.configure(compile_context, input, &_reduction_output_mean, 0, ReductionOperation::SUM);
         _reduction_output_mean.allocator()->allocate();
         _mean = mean;
 
         if(stddev != nullptr)
         {
             _memory_group.manage(&_reduction_output_stddev);
-            _reduction_operation_stddev.configure(input, &_reduction_output_stddev, 0, ReductionOperation::SUM_SQUARE);
+            _reduction_operation_stddev.configure(compile_context, input, &_reduction_output_stddev, 0, ReductionOperation::SUM_SQUARE);
             _reduction_output_stddev.allocator()->allocate();
             _stddev     = stddev;
             _run_stddev = true;
@@ -96,8 +101,8 @@ void CLMeanStdDev::configure(ICLImage *input, float *mean, float *stddev)
             _global_sum_squared = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong));
         }
 
-        _mean_stddev_kernel.configure(input, mean, &_global_sum, stddev, &_global_sum_squared);
-        _fill_border_kernel.configure(input, _mean_stddev_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint8_t>(0)));
+        _mean_stddev_kernel.configure(compile_context, input, mean, &_global_sum, stddev, &_global_sum_squared);
+        _fill_border_kernel.configure(compile_context, input, _mean_stddev_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint8_t>(0)));
     }
 }
diff --git a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
index 9b5e707665..3dbab76c72 100644
--- a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
@@ -30,9 +30,14 @@ namespace arm_compute
 {
 void CLMeanStdDevNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, float epsilon)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon);
+}
+
+void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLMeanStdDevNormalizationKernel>();
-    k->configure(input, output, epsilon);
+    k->configure(compile_context, input, output, epsilon);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLMedian3x3.cpp b/src/runtime/CL/functions/CLMedian3x3.cpp
index 7c5c422822..dc53240f79 100644
--- a/src/runtime/CL/functions/CLMedian3x3.cpp
+++ b/src/runtime/CL/functions/CLMedian3x3.cpp
@@ -32,9 +32,14 @@ using namespace arm_compute;
 
 void CLMedian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
+}
+
+void CLMedian3x3::configure(const CLCompileContext &compile_context, 
ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLMedian3x3Kernel>();
-    k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+    k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
 }
diff --git a/src/runtime/CL/functions/CLMinMaxLocation.cpp b/src/runtime/CL/functions/CLMinMaxLocation.cpp
index 49dcbcb7df..15b28330b5 100644
--- a/src/runtime/CL/functions/CLMinMaxLocation.cpp
+++ b/src/runtime/CL/functions/CLMinMaxLocation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -42,6 +42,13 @@ CLMinMaxLocation::CLMinMaxLocation()
 }
 
 void CLMinMaxLocation::configure(const ICLImage *input, void *min, void *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, min, max, min_loc, max_loc, min_count, max_count);
+}
+
+void CLMinMaxLocation::configure(const CLCompileContext &compile_context, const ICLImage *input, void *min, void *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc,
+                                 uint32_t *min_count,
+                                 uint32_t *max_count)
 {
     ARM_COMPUTE_ERROR_ON(nullptr == min);
     ARM_COMPUTE_ERROR_ON(nullptr == max);
@@ -55,8 +62,8 @@ void CLMinMaxLocation::configure(const ICLImage *input, void *min, void *max, CL
     _min_loc = min_loc;
     _max_loc = max_loc;
 
-    _min_max_kernel.configure(input, &_min_max_vals);
-    _min_max_loc_kernel.configure(input, &_min_max_vals, &_min_max_count_vals, _min_loc, _max_loc);
+    _min_max_kernel.configure(compile_context, input, &_min_max_vals);
+    _min_max_loc_kernel.configure(compile_context, input, &_min_max_vals, &_min_max_count_vals, _min_loc, _max_loc);
 }
 
 void CLMinMaxLocation::run()
diff --git a/src/runtime/CL/functions/CLNonLinearFilter.cpp b/src/runtime/CL/functions/CLNonLinearFilter.cpp
index 843a756fea..96912a21cd 100644
--- a/src/runtime/CL/functions/CLNonLinearFilter.cpp
+++ b/src/runtime/CL/functions/CLNonLinearFilter.cpp
@@ -32,9 +32,15 @@ using namespace arm_compute;
 
 void CLNonLinearFilter::configure(ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
                                   BorderMode border_mode, uint8_t constant_border_value)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, function, mask_size, pattern, mask, border_mode, constant_border_value);
+}
+
+void CLNonLinearFilter::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern,
+                                  const uint8_t *mask, BorderMode border_mode, uint8_t constant_border_value)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLNonLinearFilterKernel>();
-    k->configure(input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED);
+    k->configure(compile_context, input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler.configure(compile_context, input, 
_kernel->border_size(), border_mode, PixelValue(constant_border_value));
 }
diff --git a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
index 2f9c02dbb6..6d4a28db26 100644
--- a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
+++ b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
@@ -31,17 +31,22 @@ using namespace arm_compute;
 
 void CLNonMaximaSuppression3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode);
+}
+
+void CLNonMaximaSuppression3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLNonMaximaSuppression3x3Kernel>();
-    k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+    k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
 
     if(border_mode != BorderMode::UNDEFINED)
     {
-        _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT);
+        _border_handler.configure(compile_context, input, _kernel->border_size(), BorderMode::CONSTANT);
     }
     else
     {
-        _border_handler.configure(input, _kernel->border_size(), BorderMode::UNDEFINED);
+        _border_handler.configure(compile_context, input, _kernel->border_size(), BorderMode::UNDEFINED);
     }
 }
diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp
index 8489fab68b..f59a4ca959 100644
--- a/src/runtime/CL/functions/CLNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -38,14 +38,19 @@ CLNormalizationLayer::CLNormalizationLayer()
 }
 
 void CLNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info);
+}
+
+void CLNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info)
 {
     ARM_COMPUTE_ERROR_ON(input == nullptr);
 
     // Configure normalization kernel
-    _norm_kernel.configure(input, output, norm_info);
+    _norm_kernel.configure(compile_context, input, output, norm_info);
 
     // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
-    _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue());
+    _border_handler.configure(compile_context, input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue());
 }
 
 Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
diff --git a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
index 33b993a003..b03de6475b 100644
--- a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
@@ -32,9 +32,14 @@ namespace arm_compute
 {
 void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std);
+}
+
+void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const 
ICLTensor *mean, const ICLTensor *std)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLNormalizePlanarYUVLayerKernel>();
-    k->configure(input, output, mean, std);
+    k->configure(compile_context, input, output, mean, std);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp b/src/runtime/CL/functions/CLOpticalFlow.cpp
index 5e49937b5f..5f7c1704ee 100644
--- a/src/runtime/CL/functions/CLOpticalFlow.cpp
+++ b/src/runtime/CL/functions/CLOpticalFlow.cpp
@@ -61,6 +61,15 @@ void CLOpticalFlow::configure(const CLPyramid *old_pyramid, const CLPyramid *new
                               const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points,
                               Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate,
                               BorderMode border_mode, uint8_t constant_border_value)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), old_pyramid, new_pyramid, old_points, new_points_estimates, new_points, termination, epsilon, num_iterations, window_dimension,
+              use_initial_estimate, border_mode, constant_border_value);
+}
+
+void CLOpticalFlow::configure(const CLCompileContext &compile_context, const CLPyramid *old_pyramid, const CLPyramid *new_pyramid,
+                              const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points,
+                              Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate,
+                              BorderMode border_mode, uint8_t constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON(nullptr == old_pyramid);
     ARM_COMPUTE_ERROR_ON(nullptr == new_pyramid);
@@ -122,18 +131,18 @@ void CLOpticalFlow::configure(const CLPyramid *old_pyramid, const CLPyramid *new
         _memory_group.manage(&_scharr_gy[i]);
 
         // Init Scharr kernel
-        _func_scharr[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value);
+        _func_scharr[i].configure(compile_context, old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value);
 
         // Init Lucas-Kanade init kernel
-        _tracker_init_kernel[i].configure(old_points, new_points_estimates, _old_points_internal.get(), _new_points_internal.get(), use_initial_estimate, i, _num_levels, pyr_scale);
+        _tracker_init_kernel[i].configure(compile_context, old_points, new_points_estimates, _old_points_internal.get(), _new_points_internal.get(), use_initial_estimate, i, _num_levels, pyr_scale);
 
         // Init Lucas-Kanade stage0 kernel
-        _tracker_stage0_kernel[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i],
+        _tracker_stage0_kernel[i].configure(compile_context, old_ith_input, &_scharr_gx[i], &_scharr_gy[i],
                                             _old_points_internal.get(), _new_points_internal.get(), _coefficient_table.get(), _old_values.get(),
                                             window_dimension, i);
 
         // Init Lucas-Kanade stage1 kernel
-        _tracker_stage1_kernel[i].configure(new_ith_input, _new_points_internal.get(), _coefficient_table.get(), _old_values.get(),
+        _tracker_stage1_kernel[i].configure(compile_context, new_ith_input, _new_points_internal.get(), _coefficient_table.get(), _old_values.get(),
                                             termination, epsilon, num_iterations, window_dimension, i);
 
         // Allocate intermediate buffers
@@ -142,7 +151,7 @@ void CLOpticalFlow::configure(const CLPyramid *old_pyramid, const CLPyramid *new
     }
 
     // Finalize Lucas-Kanade
-    _tracker_finalize_kernel.configure(_new_points_internal.get(), new_points);
+    _tracker_finalize_kernel.configure(compile_context, _new_points_internal.get(), new_points);
 }
 
 void CLOpticalFlow::run()
diff --git a/src/runtime/CL/functions/CLPReluLayer.cpp b/src/runtime/CL/functions/CLPReluLayer.cpp
index ab4f53d960..6543ab922e 100644
--- a/src/runtime/CL/functions/CLPReluLayer.cpp
+++ b/src/runtime/CL/functions/CLPReluLayer.cpp
@@ -31,7 +31,7 @@ namespace arm_compute
 {
 namespace
 {
-void configure_border_handler(CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output)
+void configure_border_handler(const CLCompileContext &compile_context, CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output)
 {
     if(output->info()->dimension(0) > 1)
     {
@@ -39,18 +39,23 @@ void configure_border_handler(CLFillBorderKernel &border_handler, BorderSize bor
 
         if(broadcasted_info->info()->dimension(0) == 1)
         {
-            border_handler.configure(broadcasted_info, border_size, BorderMode::REPLICATE);
+            border_handler.configure(compile_context, broadcasted_info, border_size, BorderMode::REPLICATE);
         }
     }
 }
 } // namespace
 
 void CLPReluLayer::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, alpha, output);
+}
+
+void CLPReluLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
-    k->configure(ArithmeticOperation::PRELU, input, alpha, output);
+    k->configure(compile_context, ArithmeticOperation::PRELU, input, alpha, output);
     _kernel = std::move(k);
-    configure_border_handler(_border_handler, _kernel->border_size(), input, alpha, output);
+    configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input, alpha, output);
 }
 
 Status CLPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
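CLPReluLayer above and CLPixelWiseMultiplication below share the same broadcast handling: when the output has width greater than one but one input is a single column, that input gets a REPLICATE border so the vectorised kernel can safely read past its edge. The helper is shown whole here for readability; the line that picks broadcasted_info is elided by the hunk above and is assumed to match the sibling code in CLPixelWiseMultiplication.cpp:

    // Sketch, reconstructed from the two files in this patch.
    void configure_border_handler(const CLCompileContext &compile_context, CLFillBorderKernel &border_handler, BorderSize border_size,
                                  ICLTensor *input1, ICLTensor *input2, const ICLTensor *output)
    {
        if(output->info()->dimension(0) > 1)
        {
            // Assumed from CLPixelWiseMultiplication.cpp: pick the input broadcast along x.
            ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;

            if(broadcasted_info->info()->dimension(0) == 1)
            {
                // Replicate the single column so vector loads stay in bounds.
                border_handler.configure(compile_context, broadcasted_info, border_size, BorderMode::REPLICATE);
            }
        }
    }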
diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp
index 8f36a69866..078bdbc51f 100644
--- a/src/runtime/CL/functions/CLPadLayer.cpp
+++ b/src/runtime/CL/functions/CLPadLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -31,6 +31,11 @@ CLPadLayer::CLPadLayer()
 }
 
 void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode);
+}
+
+void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
 {
     ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode));
 
@@ -41,12 +46,12 @@ void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingLis
 
     if(_perform_pad)
     {
-        _pad_kernel.configure(input, output, padding, constant_value, mode);
+        _pad_kernel.configure(compile_context, input, output, padding, constant_value, mode);
     }
     else
     {
         // Copy the input to the whole output if no padding is applied
-        _copy_kernel.configure(input, output);
+        _copy_kernel.configure(compile_context, input, output);
     }
 }
 
 Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
diff --git a/src/runtime/CL/functions/CLPermute.cpp b/src/runtime/CL/functions/CLPermute.cpp
index 6b88ef86ac..e6323ce504 100644
--- a/src/runtime/CL/functions/CLPermute.cpp
+++ b/src/runtime/CL/functions/CLPermute.cpp
@@ -31,9 +31,14 @@ namespace arm_compute
 {
 void CLPermute::configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, perm);
+}
+
+void CLPermute::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLPermuteKernel>();
-    k->configure(input, output, perm);
+    k->configure(compile_context, input, output, perm);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLPhase.cpp b/src/runtime/CL/functions/CLPhase.cpp
index 537dda02f4..b915104f38 100644
--- a/src/runtime/CL/functions/CLPhase.cpp
+++ b/src/runtime/CL/functions/CLPhase.cpp
@@ -31,8 +31,13 @@ using namespace arm_compute;
 
 void CLPhase::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, phase_type);
+}
+
+void CLPhase::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLMagnitudePhaseKernel>();
-    k->configure(input1, input2, nullptr, output, MagnitudeType::L1NORM, phase_type);
+    k->configure(compile_context, input1, input2, nullptr, output, MagnitudeType::L1NORM, phase_type);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
index b527922d2b..3c1a7de76d 100644
--- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
+++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
@@ -33,9 +33,15 @@ namespace arm_compute
 {
 void CLPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
                                           ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, rounding_policy, 
act_info);
+}
+
+void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
+                                          ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLPixelWiseMultiplicationKernel>();
-    k->configure(input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
+    k->configure(compile_context, input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
     _kernel = std::move(k);
 
     if(output->info()->dimension(0) > 1)
@@ -44,7 +50,7 @@ void CLPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2,
 
         if(broadcasted_info->info()->dimension(0) == 1)
         {
-            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+            _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
         }
     }
 }
@@ -56,9 +62,14 @@ Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITen
 }
 
 void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
+}
+
+void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLComplexPixelWiseMultiplicationKernel>();
-    k->configure(input1, input2, output, act_info);
+    k->configure(compile_context, input1, input2, output, act_info);
     _kernel = std::move(k);
 
     if(output->info()->dimension(0) > 1)
@@ -67,7 +78,7 @@ void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *i
 
         if(broadcasted_info->info()->dimension(0) == 1)
        {
-            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+            _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
         }
     }
 }
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index 9c4fa4a2ba..e7735b00df 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -31,12 +31,17 @@ namespace arm_compute
 {
 void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info, indices);
+}
+
+void CLPoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
     // Configure pooling kernel
     auto k = arm_compute::support::cpp14::make_unique<CLPoolingLayerKernel>();
     k->set_target(CLScheduler::get().target());
-    k->configure(input, output, pool_info, indices);
+    k->configure(compile_context, input, output, pool_info, indices);
     _kernel = std::move(k);
 
     const DataType data_type = input->info()->data_type();
@@ -74,7 +79,7 @@ void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const Poolin
         default:
             ARM_COMPUTE_ERROR("Data layout not supported");
     }
 
-    _border_handler.configure(input, _kernel->border_size(), border_mode, pixel_value);
+    _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, pixel_value);
 
     // Tune kernels
     CLScheduler::get().tune_kernel_static(*_kernel);
diff --git a/src/runtime/CL/functions/CLPriorBoxLayer.cpp b/src/runtime/CL/functions/CLPriorBoxLayer.cpp
index 4f6c969a92..d01b4c711b 100644
--- a/src/runtime/CL/functions/CLPriorBoxLayer.cpp
+++ b/src/runtime/CL/functions/CLPriorBoxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -39,6 +39,11 @@ CLPriorBoxLayer::CLPriorBoxLayer()
 }
 
 void CLPriorBoxLayer::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info);
+}
+
+void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info)
 {
     _min           = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.min_sizes().size() * sizeof(float));
     _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.aspect_ratios().size() * sizeof(float));
@@ -48,7 +53,7 @@ void CLPriorBoxLayer::configure(const ICLTensor *input1, const ICLTensor *input2
     }
 
     auto k = arm_compute::support::cpp14::make_unique<CLPriorBoxLayerKernel>();
-    k->configure(input1, input2, output, info, &_min, &_max, &_aspect_ratios);
+    k->configure(compile_context, input1, input2, output, info, &_min, &_max, &_aspect_ratios);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp
index 4b994d47b2..88c5f77b9f 100644
--- a/src/runtime/CL/functions/CLQLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp
@@ -51,7 +51,7 @@ CLQLSTMLayer::CLQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
     _memory_group = MemoryGroup(std::move(memory_manager));
 }
 
-void CLQLSTMLayer::configure_mm(CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info,
+void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info,
                                 const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias,
                                 CLTensor *mm_res, CLTensor *outstage_res, float gemmlowp_scale,
                                 const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info)
@@ -63,11 +63,11 @@ void CLQLSTMLayer::configure_mm(CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutp
     outstage_res->allocator()->init(outstage_tensor_info);
 
     // Configure matrix-multiplication
-    mm.configure(mm_input, mm_weights, nullptr, mm_res);
+    mm.configure(compile_context, mm_input, mm_weights, nullptr, mm_res);
 
     // Configure output stage
     quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
-    outstage.configure(mm_res, bias, outstage_res, gemmlowp_info);
+    outstage.configure(compile_context, mm_res, bias, outstage_res, gemmlowp_info);
     mm_res->allocator()->allocate();
 }
 
@@ -78,6 +78,19 @@ void CLQLSTMLayer::configure(const ICLTensor *input,
                              const ICLTensor *cell_state_in, const ICLTensor *output_state_in,
                              ICLTensor *cell_state_out, ICLTensor *output_state_out,
                              const LSTMParams<ICLTensor> &lstm_params)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+              recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+              cell_state_in, output_state_in, 
cell_state_out, output_state_out, lstm_params);
+}
+
+void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input,
+                             const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
+                             const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
+                             const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
+                             const ICLTensor *cell_state_in, const ICLTensor *output_state_in,
+                             ICLTensor *cell_state_out, ICLTensor *output_state_out,
+                             const LSTMParams<ICLTensor> &lstm_params)
 {
     ARM_COMPUTE_UNUSED(forget_gate_bias);
     ARM_COMPUTE_UNUSED(cell_bias);
@@ -133,36 +146,36 @@ void CLQLSTMLayer::configure(const ICLTensor *input,
         _input_to_input_weights     = lstm_params.input_to_input_weights();
         _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights();
 
-        _input_to_input_reduction.configure(_input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-        _recurrent_to_input_reduction.configure(_recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+        _input_to_input_reduction.configure(compile_context, _input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+        _recurrent_to_input_reduction.configure(compile_context, _recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
     }
 
-    _input_to_forget_reduction.configure(input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-    _recurrent_to_forget_reduction.configure(recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
-    _input_to_cell_reduction.configure(input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-    _recurrent_to_cell_reduction.configure(recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
-    _input_to_output_reduction.configure(input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-    _recurrent_to_output_reduction.configure(recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+    _input_to_forget_reduction.configure(compile_context, input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+    _recurrent_to_forget_reduction.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+    _input_to_cell_reduction.configure(compile_context, input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+    _recurrent_to_cell_reduction.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+    _input_to_output_reduction.configure(compile_context, input_to_output_weights, 
&_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+    _recurrent_to_output_reduction.configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
 
     if(_projection_bias != nullptr)
     {
-        _projection_reduction.configure(_projection_weights, &_projection_reduction_res, GEMMLowpReductionKernelInfo(num_units, false, lstm_params.hidden_state_zero(), true));
-        _projection_bias_add.configure(ArithmeticOperation::ADD, _projection_bias, &_projection_reduction_res, &_projection_eff_bias, ConvertPolicy::SATURATE);
+        _projection_reduction.configure(compile_context, _projection_weights, &_projection_reduction_res, GEMMLowpReductionKernelInfo(num_units, false, lstm_params.hidden_state_zero(), true));
+        _projection_bias_add.configure(compile_context, ArithmeticOperation::ADD, _projection_bias, &_projection_reduction_res, &_projection_eff_bias, ConvertPolicy::SATURATE);
     }
 
     // Pre-transpose weights to be used in GEMM.
-    _transpose_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_transposed);
-    _transpose_input_to_cell_weights.configure(input_to_cell_weights, &_input_to_cell_weights_transposed);
-    _transpose_input_to_output_weights.configure(input_to_output_weights, &_input_to_output_weights_transposed);
-    _transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed);
-    _transpose_recurrent_to_cell_weights.configure(recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed);
-    _transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights, &_recurrent_to_output_weights_transposed);
+    _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights, &_input_to_forget_weights_transposed);
+    _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights, &_input_to_cell_weights_transposed);
+    _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights, &_input_to_output_weights_transposed);
+    _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed);
+    _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed);
+    _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_weights_transposed);
     if(!_has_cifg)
     {
-        _transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed);
-        _transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed);
+        _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed);
+        _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed);
     }
     if(_has_projection)
     {
-        _transpose_projection_weights.configure(_projection_weights, &_projection_weights_transposed);
+        _transpose_projection_weights.configure(compile_context, _projection_weights, &_projection_weights_transposed);
     }
 
     GEMMLowpOutputStageInfo gemmlowp_info;
@@ -175,31 +188,33 @@ void CLQLSTMLayer::configure(const ICLTensor *input,
 
     // Forget gate.
     const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
 
     const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
-    configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info,
+    configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info,
                  input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias,
                  &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale,
                  mm_out_info, forget_gate_outstage_info);
 
     const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
-    configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info,
+    configure_mm(compile_context, _mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info,
                  output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias,
                  &_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale,
                  mm_out_info, forget_gate_outstage_info);
 
-    _accumulate_input_recurrent_forget.configure(ArithmeticOperation::ADD, &_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE);
+    _accumulate_input_recurrent_forget.configure(compile_context, ArithmeticOperation::ADD, &_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
+                                                 ConvertPolicy::SATURATE);
     _input_to_forget_outstage_res.allocator()->allocate();
 
     if(_has_peephole)
     {
         _memory_group.manage(&_mul_cell_to_forget_res);
-        _pixelwise_mul_cell_to_forget.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+        _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
         _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)));
         _memory_group.manage(&_cell_to_forget_outstage_res);
         const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
         quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
-        _cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info);
+        _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info);
         _mul_cell_to_forget_res.allocator()->allocate();
-        _accumulate_cell_forget.configure(ArithmeticOperation::ADD, &_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE);
+        _accumulate_cell_forget.configure(compile_context, ArithmeticOperation::ADD, &_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
+                                          ConvertPolicy::SATURATE);
         _cell_to_forget_outstage_res.allocator()->allocate();
     }
 
@@ -209,30 +224,31 @@ void 
CLQLSTMLayer::configure(const ICLTensor *input,
     const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
     _memory_group.manage(&_forget_gate);
     _forget_gate.allocator()->init(forget_gate_info);
-    _forget_gate_sigmoid.configure(&_recurrent_to_forget_outstage_res, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _forget_gate_sigmoid.configure(compile_context, &_recurrent_to_forget_outstage_res, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     _recurrent_to_forget_outstage_res.allocator()->allocate();
 
     // Modulation gate.
     const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
 
     const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
-    configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info,
+    configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info,
                  input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias,
                  &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale,
                  mm_out_info, cell_outstage_info);
 
     const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
-    configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info,
+    configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info,
                  output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias,
                  &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale,
                  mm_out_info, cell_outstage_info);
 
-    _accumulate_input_recurrent_modulation.configure(ArithmeticOperation::ADD, &_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE);
+    _accumulate_input_recurrent_modulation.configure(compile_context, ArithmeticOperation::ADD, &_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res,
+                                                     ConvertPolicy::SATURATE);
     _input_to_cell_outstage_res.allocator()->allocate();
 
     const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
     _memory_group.manage(&_cell_gate);
     _cell_gate.allocator()->init(cell_gate_info);
-    _cell_gate_tanh.configure(&_recurrent_to_cell_outstage_res, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+    _cell_gate_tanh.configure(compile_context, &_recurrent_to_cell_outstage_res, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
     _recurrent_to_cell_outstage_res.allocator()->allocate();
 
     // Input gate.
@@ -242,75 +258,77 @@ void CLQLSTMLayer::configure(const ICLTensor *input,
     if(_has_cifg)
     {
         _ones.allocator()->init(*_forget_gate.info());
-        _input_gate_sub.configure(ArithmeticOperation::SUB, &_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE);
+        _input_gate_sub.configure(compile_context, ArithmeticOperation::SUB, &_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE);
         _ones.allocator()->allocate();
     }
     else
     {
         const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
 
         const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
-        configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info,
+        configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info,
                      input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias,
                      &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale,
                      mm_out_info, input_outstage_info);
 
         const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
-        configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info,
+        configure_mm(compile_context, _mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info,
                      input, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias,
                      &_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale,
                      mm_out_info, input_outstage_info);
-        _accumulate_input_recurrent_input.configure(ArithmeticOperation::ADD, &_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
+        _accumulate_input_recurrent_input.configure(compile_context, ArithmeticOperation::ADD, &_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res,
+                                                    ConvertPolicy::SATURATE);
         _input_to_input_outstage_res.allocator()->allocate();
 
         if(_has_peephole)
         {
             _memory_group.manage(&_mul_cell_to_input_res);
-            _pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+            _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
             const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
             quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
             _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));
             _memory_group.manage(&_cell_to_input_outstage_res);
-            _cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info);
+            _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info);
             _mul_cell_to_input_res.allocator()->allocate();
             _accumulate_cell_input.configure(ArithmeticOperation::ADD, &_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, 
&_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
             _cell_to_input_outstage_res.allocator()->allocate();
         }
 
-        _input_gate_tanh.configure(&_recurrent_to_input_outstage_res, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+        _input_gate_tanh.configure(compile_context, &_recurrent_to_input_outstage_res, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
         _recurrent_to_input_outstage_res.allocator()->allocate();
     }
 
     // Cell.
     // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel
-    _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     const float      cell_gate_scale      = _cell_gate.info()->quantization_info().uniform().scale;
     const float      mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift);
     const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0));
     _memory_group.manage(&_mul_input_cell_res);
     _mul_input_cell_res.allocator()->init(mul_input_cell_info);
-    _pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     _cell_gate.allocator()->allocate();
-    _add_forget_cell.configure(ArithmeticOperation::ADD, &_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE);
+    _add_forget_cell.configure(compile_context, ArithmeticOperation::ADD, &_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE);
     _mul_input_cell_res.allocator()->allocate();
     _forget_gate.allocator()->allocate();
     if(_has_cell_clipping)
     {
-        _cell_clip.configure(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip));
+        _cell_clip.configure(compile_context, cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip));
     }
 
     // Output gate.
     const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
 
     const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
-    configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info,
+    configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info,
                  input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias,
                  &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale,
                  mm_out_info, output_outstage_info);
 
     const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
-    configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info,
+    configure_mm(compile_context, _mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info,
                  output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias,
                  &_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale,
                  mm_out_info, output_outstage_info);
 
-    _accumulate_input_recurrent_output.configure(ArithmeticOperation::ADD, &_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE);
+    _accumulate_input_recurrent_output.configure(compile_context, ArithmeticOperation::ADD, &_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res,
+                                                 ConvertPolicy::SATURATE);
     _input_to_output_outstage_res.allocator()->allocate();
 
     if(_has_peephole)
@@ -320,31 +338,32 @@ void CLQLSTMLayer::configure(const ICLTensor *input,
         // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
         // quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
         _memory_group.manage(&_mul_cell_to_output_res);
-        _pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-        _accumulate_cell_to_output.configure(ArithmeticOperation::ADD, &_recurrent_to_output_outstage_res, &_mul_cell_to_output_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE);
+        _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+        _accumulate_cell_to_output.configure(compile_context, ArithmeticOperation::ADD, &_recurrent_to_output_outstage_res, &_mul_cell_to_output_res, &_recurrent_to_output_outstage_res,
+                                             ConvertPolicy::SATURATE);
         _mul_cell_to_output_res.allocator()->allocate();
     }
 
     const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
     _memory_group.manage(&_output_gate);
     _output_gate.allocator()->init(output_gate_info);
-    _output_gate_sigmoid.configure(&_recurrent_to_output_outstage_res, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _output_gate_sigmoid.configure(compile_context, &_recurrent_to_output_outstage_res, &_output_gate, 
ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     _recurrent_to_output_outstage_res.allocator()->allocate();
 
     // Hidden.
-    _hidden_tanh.configure(cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+    _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
     // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel
     _memory_group.manage(&_hidden_mul_res);
     const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32);
     _hidden_mul_res.allocator()->init(hidden_mul_res);
-    _pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     _output_gate.allocator()->allocate();
     _input_gate.allocator()->allocate();
     const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
     quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);
     gemmlowp_info.gemmlowp_offset  = lstm_params.hidden_state_zero();
     gemmlowp_info.output_data_type = output_state_in->info()->data_type();
-    _hidden_outstage.configure(&_hidden_mul_res, nullptr, output_state_out, gemmlowp_info);
+    _hidden_outstage.configure(compile_context, &_hidden_mul_res, nullptr, output_state_out, gemmlowp_info);
     _hidden_mul_res.allocator()->allocate();
 
     // Projection.
@@ -358,12 +377,12 @@ void CLQLSTMLayer::configure(const ICLTensor *input,
         gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
         gemmlowp_info.output_data_type   = DataType::QASYMM8_SIGNED;
 
-        configure_mm(_mm_projection, _projection_outstage, gemmlowp_info,
+        configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info,
                      output_state_out, &_projection_weights_transposed, &_projection_eff_bias,
                      &_mm_projection_res, &_projection_outstage_res, projection_scale,
                      mm_out_info, projection_outstage_info);
 
-        _accumulate_projection.configure(ArithmeticOperation::ADD, &_projection_outstage_res, output_state_out, output_state_out, ConvertPolicy::SATURATE);
+        _accumulate_projection.configure(compile_context, ArithmeticOperation::ADD, &_projection_outstage_res, output_state_out, output_state_out, ConvertPolicy::SATURATE);
         _projection_outstage_res.allocator()->allocate();
 
         int8_t quantized_projection_clip{ 0 };
@@ -374,7 +393,8 @@ void CLQLSTMLayer::configure(const ICLTensor *input,
 
         if(quantized_projection_clip > 0)
         {
-            _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, quantized_projection_clip));
+            _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip,
+                                                                                                       quantized_projection_clip));
             _has_projection_clipping = true;
         }
     }
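Most of the QLSTM changes above funnel through configure_mm(): each gate runs a GEMMLowp matrix multiply followed by an output stage whose fixed-point multiplier is derived from a float effective scale. A sketch of that derivation (weights, input_scale and intermediate_scale are generic stand-ins for the per-gate values used above):

    // Effective scale of a gate's matmul result, re-expressed in the gate's
    // QSYMM16 intermediate scale, then split into the integer multiplier and
    // shift consumed by the GEMMLowp output stage.
    const float effective_scale = weights->info()->quantization_info().uniform().scale * input_scale / intermediate_scale;
    GEMMLowpOutputStageInfo gemmlowp_info{};
    quantization::calculate_quantized_multiplier(effective_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);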
diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp
index cc78ccede3..6239f279ea 100644
--- a/src/runtime/CL/functions/CLQuantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp
@@ -29,9 +29,14 @@ namespace arm_compute
 {
 void CLQuantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLQuantizationLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLQuantizationLayerKernel>();
-    k->configure(input, output);
+    k->configure(compile_context, input, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index e839a6ba21..57b8d70089 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -67,6 +67,13 @@ Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights
 
 void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, ICLTensor *output,
                            ActivationLayerInfo &info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state, output, info);
+}
+
+void CLRNNLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias,
+                           ICLTensor *hidden_state,
+                           ICLTensor *output, ActivationLayerInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
     ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info));
@@ -81,23 +88,23 @@ void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, con
 
     // Manage intermediate buffers and configure
     _memory_group.manage(&_fully_connected_out);
-    _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out);
+    _fully_connected_kernel.configure(compile_context, input, weights, bias, &_fully_connected_out);
 
     _memory_group.manage(&_gemm_output);
-    _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
+    _gemm_state_f.configure(compile_context, hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
 
     _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
     _memory_group.manage(&_add_output);
 
-    _add_kernel.configure(ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
+    _add_kernel.configure(compile_context, ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
 
     _fully_connected_out.allocator()->allocate();
     _gemm_output.allocator()->allocate();
 
-    _activation_kernel.configure(&_add_output, hidden_state, info);
+    _activation_kernel.configure(compile_context, &_add_output, hidden_state, info);
     _add_output.allocator()->allocate();
 
-    _copy_kernel.configure(hidden_state, output);
+    _copy_kernel.configure(compile_context, hidden_state, output);
 }
 
 void CLRNNLayer::run()
diff --git a/src/runtime/CL/functions/CLROIAlignLayer.cpp b/src/runtime/CL/functions/CLROIAlignLayer.cpp
index f0044e20a0..43b58ddb9b 100644
--- a/src/runtime/CL/functions/CLROIAlignLayer.cpp
+++ b/src/runtime/CL/functions/CLROIAlignLayer.cpp
@@ -37,10 +37,15 @@ Status CLROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *ro
 }
 
 void CLROIAlignLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+{
+    
configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info); +} + +void CLROIAlignLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) { // Configure ROI pooling kernel auto k = arm_compute::support::cpp14::make_unique<CLROIAlignLayerKernel>(); - k->configure(input, rois, output, pool_info); + k->configure(compile_context, input, rois, output, pool_info); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp index 6780907155..bb54cfa2ca 100644 --- a/src/runtime/CL/functions/CLROIPoolingLayer.cpp +++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp @@ -31,9 +31,14 @@ using namespace arm_compute; void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info); +} + +void CLROIPoolingLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info) { // Configure ROI pooling kernel auto k = arm_compute::support::cpp14::make_unique<CLROIPoolingLayerKernel>(); - k->configure(input, rois, output, pool_info); + k->configure(compile_context, input, rois, output, pool_info); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLRange.cpp b/src/runtime/CL/functions/CLRange.cpp index 5dedcc081c..b29b03d5b5 100644 --- a/src/runtime/CL/functions/CLRange.cpp +++ b/src/runtime/CL/functions/CLRange.cpp @@ -33,10 +33,15 @@ using namespace arm_compute; void CLRange::configure(ICLTensor *output, const float start, const float end, const float step) +{ + configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step); +} + +void CLRange::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step) { auto k = arm_compute::support::cpp14::make_unique<CLRangeKernel>(); k->set_target(CLScheduler::get().target()); - k->configure(output, start, end, step); + k->configure(compile_context, output, start, end, step); _kernel = std::move(k); // Tune kernels diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp index 3b7a7f873a..3ddaa00d4b 100644 --- a/src/runtime/CL/functions/CLReduceMean.cpp +++ b/src/runtime/CL/functions/CLReduceMean.cpp @@ -92,6 +92,11 @@ CLReduceMean::CLReduceMean(std::shared_ptr<IMemoryManager> memory_manager) { } void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, reduction_axis, keep_dims, output); +} + +void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output) { // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(CLReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info())); @@ -118,13 +123,13 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis if(i == _reduction_ops - 1 && keep_dims) { - _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM); + _reduction_kernels[i].configure(compile_context, in, output, axis_local[i], ReductionOperation::MEAN_SUM); } else { _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(),
input->info()->data_type(), input->info()->quantization_info())); _memory_group.manage(&_reduced_outs[i]); - _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM); + _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM); } } @@ -147,7 +152,7 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis out_shape.remove_dimension(axis_local[i] - i); } auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape)); - _reshape.configure(&_reduced_outs[_reduction_ops - 1], output); + _reshape.configure(compile_context, &_reduced_outs[_reduction_ops - 1], output); } } diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp index 77168998f8..b659ecfaf6 100644 --- a/src/runtime/CL/functions/CLReductionOperation.cpp +++ b/src/runtime/CL/functions/CLReductionOperation.cpp @@ -190,6 +190,11 @@ ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor } void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op, keep_dims); +} + +void CLReductionOperation::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); _op = op; @@ -218,7 +223,7 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign _memory_group.manage(&_results_vector.back()); } - _reduction_kernels_vector[0].configure(input, output_internal, axis, op, 0); + _reduction_kernels_vector[0].configure(compile_context, input, output_internal, axis, op, 0); } else { @@ -318,15 +323,15 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign ARM_COMPUTE_ERROR("Not supported"); } - _reduction_kernels_vector[0].configure(input, &_results_vector[0], axis, first_kernel_op); - _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue); + _reduction_kernels_vector[0].configure(compile_context, input, &_results_vector[0], axis, first_kernel_op); + _border_handlers_vector[0].configure(compile_context, input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue); // Apply ReductionOperation on intermediate stages for(unsigned int i = 1; i < _num_of_stages - 1; ++i) { _memory_group.manage(&_results_vector[i]); - _reduction_kernels_vector[i].configure(&_results_vector[i - 1], &_results_vector[i], axis, intermediate_kernel_op); - _border_handlers_vector[i].configure(&_results_vector[i - 1], _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue); + _reduction_kernels_vector[i].configure(compile_context, &_results_vector[i - 1], &_results_vector[i], axis, intermediate_kernel_op); + _border_handlers_vector[i].configure(compile_context, &_results_vector[i - 1], _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue); _results_vector[i - 1].allocator()->allocate(); } @@ -339,14 +344,14 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign _memory_group.manage(&_results_vector.back()); } - _reduction_kernels_vector[last_stage].configure(&_results_vector[last_stage - 1], output_internal, axis, last_kernel_op, input_width); - 
_border_handlers_vector[last_stage].configure(&_results_vector[last_stage - 1], _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue); + _reduction_kernels_vector[last_stage].configure(compile_context, &_results_vector[last_stage - 1], output_internal, axis, last_kernel_op, input_width); + _border_handlers_vector[last_stage].configure(compile_context, &_results_vector[last_stage - 1], _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue); _results_vector[last_stage - 1].allocator()->allocate(); } if(_is_reshape_required) { - _reshape_kernel.configure(&_results_vector.back(), output); + _reshape_kernel.configure(compile_context, &_results_vector.back(), output); _results_vector.back().allocator()->allocate(); } } diff --git a/src/runtime/CL/functions/CLRemap.cpp b/src/runtime/CL/functions/CLRemap.cpp index 9e4529b1dc..af241ec299 100644 --- a/src/runtime/CL/functions/CLRemap.cpp +++ b/src/runtime/CL/functions/CLRemap.cpp @@ -36,6 +36,13 @@ using namespace arm_compute; void CLRemap::configure(ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, map_x, map_y, output, policy, border_mode, constant_border_value); +} + +void CLRemap::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, + BorderMode border_mode, + uint8_t constant_border_value) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); @@ -44,7 +51,7 @@ void CLRemap::configure(ICLTensor *input, const ICLTensor *map_x, const ICLTenso ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported"); auto k = arm_compute::support::cpp14::make_unique<CLRemapKernel>(); - k->configure(input, map_x, map_y, output, policy, border_mode == BorderMode::UNDEFINED); + k->configure(compile_context, input, map_x, map_y, output, policy, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLReorgLayer.cpp b/src/runtime/CL/functions/CLReorgLayer.cpp index 547d0fc483..ea9331414c 100644 --- a/src/runtime/CL/functions/CLReorgLayer.cpp +++ b/src/runtime/CL/functions/CLReorgLayer.cpp @@ -35,9 +35,14 @@ using namespace arm_compute; void CLReorgLayer::configure(ICLTensor *input, ICLTensor *output, int32_t stride) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, stride); +} + +void CLReorgLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t stride) { auto k = arm_compute::support::cpp14::make_unique<CLReorgLayerKernel>(); - k->configure(input, output, stride); + k->configure(compile_context, input, output, stride); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp index 3325d2c234..13baedb3f9 100644 --- a/src/runtime/CL/functions/CLReshapeLayer.cpp +++ b/src/runtime/CL/functions/CLReshapeLayer.cpp @@ -31,9 +31,14 @@ using namespace arm_compute; void CLReshapeLayer::configure(const
ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLReshapeLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique<CLReshapeLayerKernel>(); - k->configure(input, output); + k->configure(compile_context, input, output); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLReverse.cpp b/src/runtime/CL/functions/CLReverse.cpp index 3f7a2708bb..3c8bc15a54 100644 --- a/src/runtime/CL/functions/CLReverse.cpp +++ b/src/runtime/CL/functions/CLReverse.cpp @@ -30,9 +30,14 @@ namespace arm_compute { void CLReverse::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, axis); +} + +void CLReverse::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis) { auto k = arm_compute::support::cpp14::make_unique<CLReverseKernel>(); - k->configure(input, output, axis); + k->configure(compile_context, input, output, axis); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp index 9ec8c44836..a9395bdc3d 100644 --- a/src/runtime/CL/functions/CLScale.cpp +++ b/src/runtime/CL/functions/CLScale.cpp @@ -34,11 +34,17 @@ using namespace arm_compute; void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding, bool align_corners) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners); +} + +void CLScale::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, + SamplingPolicy sampling_policy, bool use_padding, bool align_corners) { ARM_COMPUTE_UNUSED(use_padding); auto k = arm_compute::support::cpp14::make_unique<CLScaleKernel>(); k->set_target(CLScheduler::get().target()); - k->configure(input, output, policy, border_mode, sampling_policy, align_corners); + k->configure(compile_context, input, output, policy, border_mode, sampling_policy, align_corners); _kernel = std::move(k); // Tune kernels @@ -50,7 +56,7 @@ void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy { border_mode = BorderMode::CONSTANT; } - _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value); + _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, constant_border_value); } Status CLScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, diff --git a/src/runtime/CL/functions/CLScharr3x3.cpp b/src/runtime/CL/functions/CLScharr3x3.cpp index df96ce83ba..faad5424a2 100644 --- a/src/runtime/CL/functions/CLScharr3x3.cpp +++ b/src/runtime/CL/functions/CLScharr3x3.cpp @@ -32,9 +32,14 @@ using namespace arm_compute; void CLScharr3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); +} + +void CLScharr3x3::configure(const CLCompileContext
&compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) { auto k = arm_compute::support::cpp14::make_unique<CLScharr3x3Kernel>(); - k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED); + k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp index 90c368e9b8..7187010448 100644 --- a/src/runtime/CL/functions/CLSelect.cpp +++ b/src/runtime/CL/functions/CLSelect.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -32,9 +32,14 @@ using namespace arm_compute; namespace arm_compute { void CLSelect::configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), c, x, y, output); +} + +void CLSelect::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique<CLSelectKernel>(); - k->configure(c, x, y, output); + k->configure(compile_context, c, x, y, output); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp index d8e8e7e140..e8cc0f5499 100644 --- a/src/runtime/CL/functions/CLSlice.cpp +++ b/src/runtime/CL/functions/CLSlice.cpp @@ -32,6 +32,11 @@ namespace arm_compute { void CLSlice::configure(const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends); +} + +void CLSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); @@ -39,7 +44,7 @@ void CLSlice::configure(const ICLTensor *input, ICLTensor *output, const Coordin const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends); auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>(); - k->configure(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0); + k->configure(compile_context, input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLSobel3x3.cpp b/src/runtime/CL/functions/CLSobel3x3.cpp index e362ebf8ba..c3604f970f 100644 --- a/src/runtime/CL/functions/CLSobel3x3.cpp +++ b/src/runtime/CL/functions/CLSobel3x3.cpp @@ -32,9 +32,14 @@ using namespace arm_compute; void CLSobel3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); +} + +void CLSobel3x3::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) { auto k = arm_compute::support::cpp14::make_unique<CLSobel3x3Kernel>(); - k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED); +
k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLSobel5x5.cpp b/src/runtime/CL/functions/CLSobel5x5.cpp index 22fbef17eb..f8a33f3fb6 100644 --- a/src/runtime/CL/functions/CLSobel5x5.cpp +++ b/src/runtime/CL/functions/CLSobel5x5.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -39,6 +39,11 @@ CLSobel5x5::CLSobel5x5(std::shared_ptr<IMemoryManager> memory_manager) } void CLSobel5x5::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); +} + +void CLSobel5x5::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); @@ -53,8 +58,8 @@ void CLSobel5x5::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *out _tmp_y.allocator()->init(tensor_info); _memory_group.manage(&_tmp_x); _memory_group.manage(&_tmp_y); - _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); + _sobel_hor.configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert.configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); _tmp_y.allocator()->allocate(); } @@ -62,19 +67,19 @@ void CLSobel5x5::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *out { _tmp_x.allocator()->init(tensor_info); _memory_group.manage(&_tmp_x); - _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_hor.configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_vert.configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); } else if(run_sobel_y) { _tmp_y.allocator()->init(tensor_info); _memory_group.manage(&_tmp_y); - _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); + _sobel_hor.configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert.configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); _tmp_y.allocator()->allocate(); } - _border_handler.configure(input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); } void CLSobel5x5::run() diff --git a/src/runtime/CL/functions/CLSobel7x7.cpp b/src/runtime/CL/functions/CLSobel7x7.cpp index 9b38f6928f..6d3c7f0d08 100644 ---
a/src/runtime/CL/functions/CLSobel7x7.cpp +++ b/src/runtime/CL/functions/CLSobel7x7.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -39,6 +39,11 @@ CLSobel7x7::CLSobel7x7(std::shared_ptr<IMemoryManager> memory_manager) } void CLSobel7x7::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); +} + +void CLSobel7x7::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); @@ -53,8 +58,8 @@ void CLSobel7x7::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *out _tmp_y.allocator()->init(tensor_info); _memory_group.manage(&_tmp_x); _memory_group.manage(&_tmp_y); - _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); + _sobel_hor.configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert.configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); _tmp_y.allocator()->allocate(); } @@ -62,19 +67,19 @@ void CLSobel7x7::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *out { _tmp_x.allocator()->init(tensor_info); _memory_group.manage(&_tmp_x); - _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_hor.configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_vert.configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); } else if(run_sobel_y) { _tmp_y.allocator()->init(tensor_info); _memory_group.manage(&_tmp_y); - _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); + _sobel_hor.configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert.configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); _tmp_y.allocator()->allocate(); } - _border_handler.configure(input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); } void CLSobel7x7::run() diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp index e01d2c75ca..b0b2117cd9 100644 --- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp +++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited.
* * SPDX-License-Identifier: MIT * @@ -43,6 +43,12 @@ CLSoftmaxLayerGeneric<IS_LOG>::CLSoftmaxLayerGeneric(std::shared_ptr<IMemoryMan template <bool IS_LOG> void CLSoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const ICLTensor *input, const ICLTensor *output, size_t axis) +{ + configure_reshape_input_kernel(CLKernelLibrary::get().get_compile_context(), input, output, axis); +} + +template <bool IS_LOG> +void CLSoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *output, size_t axis) { // Flatten the input const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), axis); @@ -56,13 +62,13 @@ void CLSoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const ICLTens if(axis != 3) { auto reshape_kernel_ptr = support::cpp14::make_unique<CLReshapeLayerKernel>(); - reshape_kernel_ptr->configure(input, &_input_flattened); + reshape_kernel_ptr->configure(compile_context, input, &_input_flattened); _flatten_kernel_ptr = std::move(reshape_kernel_ptr); } else { auto flatten_kernel_ptr = support::cpp14::make_unique<CLFlattenLayerKernel>(); - flatten_kernel_ptr->configure(input, &_input_flattened); + flatten_kernel_ptr->configure(compile_context, input, &_input_flattened); _flatten_kernel_ptr = std::move(flatten_kernel_ptr); } @@ -73,6 +79,12 @@ void CLSoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const ICLTens template <bool IS_LOG> void CLSoftmaxLayerGeneric<IS_LOG>::configure(const ICLTensor *input, ICLTensor *output, float beta, size_t axis) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, beta, axis); +} + +template <bool IS_LOG> +void CLSoftmaxLayerGeneric<IS_LOG>::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, size_t axis) { // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -123,7 +135,7 @@ void CLSoftmaxLayerGeneric<IS_LOG>::configure(const ICLTensor *input, ICLTensor softmax_info.input_data_type = input_2D->info()->data_type(); // Configure kernels - _max_shift_exp_sum_kernel.configure(input_2D, &_max, &_tmp, &_sum, softmax_info); + _max_shift_exp_sum_kernel.configure(compile_context, input_2D, &_max, &_tmp, &_sum, softmax_info); if(_needs_flattening) { @@ -131,10 +143,10 @@ void CLSoftmaxLayerGeneric<IS_LOG>::configure(const ICLTensor *input, ICLTensor _memory_group.manage(&_output_flattened); // The normalization kernel stores the result in a flat output tensor - _norm_kernel.configure(&_tmp, &_sum, &_output_flattened, softmax_info); + _norm_kernel.configure(compile_context, &_tmp, &_sum, &_output_flattened, softmax_info); // Reshape the flat output into a the requested (4D) output - _reshape_kernel.configure(&_output_flattened, output); + _reshape_kernel.configure(compile_context, &_output_flattened, output); // Allocate the intermediate flat tensors _input_flattened.allocator()->allocate(); @@ -143,7 +155,7 @@ void CLSoftmaxLayerGeneric<IS_LOG>::configure(const ICLTensor *input, ICLTensor else { // Softmax 2D case - _norm_kernel.configure(&_tmp, &_sum, output, softmax_info); + _norm_kernel.configure(compile_context, &_tmp, &_sum, output, softmax_info); } // Allocate intermediate buffers @@ -203,7 +215,7 @@ Status CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const I } template <bool IS_LOG> -void CLSoftmaxLayerGeneric<IS_LOG>::run() +void CLSoftmaxLayerGeneric<IS_LOG>::run() { MemoryGroupResourceScope scope_mg(_memory_group); diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp index fa6e82efb0..021d31649d 100644 --- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp +++ 
b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -38,27 +38,38 @@ CLSpaceToBatchLayer::CLSpaceToBatchLayer() } void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output); +} + +void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _memset_kernel.configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + _memset_kernel.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); } - _space_to_batch_kernel.configure(input, block_shape, paddings, output); + _space_to_batch_kernel.configure(compile_context, input, block_shape, paddings, output); } void CLSpaceToBatchLayer::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output); +} + +void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, + const Size2D &padding_right, ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _memset_kernel.configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + _memset_kernel.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); } - _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output); + _space_to_batch_kernel.configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, output); } Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output) diff --git a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp index f02a13f66d..a4ffefc189 100644 --- a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp +++ b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -39,7 +39,12 @@ CLSpaceToDepthLayer::CLSpaceToDepthLayer() void CLSpaceToDepthLayer::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape) { - _space_to_depth_kernel.configure(input, output, block_shape); + configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape); +} + +void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) +{ + _space_to_depth_kernel.configure(compile_context, input, output, block_shape); } Status CLSpaceToDepthLayer::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp index 607445af4a..79c3fe5371 100644 --- a/src/runtime/CL/functions/CLStackLayer.cpp +++ b/src/runtime/CL/functions/CLStackLayer.cpp @@ -43,6 +43,11 @@ CLStackLayer::CLStackLayer() // NOLINT } void CLStackLayer::configure(const std::vector<ICLTensor *> &input, int axis, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, axis, output); +} + +void CLStackLayer::configure(const CLCompileContext &compile_context, const std::vector<ICLTensor *> &input, int axis, ICLTensor *output) { _num_inputs = input.size(); _stack_kernels.resize(_num_inputs); @@ -52,7 +57,7 @@ void CLStackLayer::configure(const std::vector<ICLTensor *> &input, int axis, IC for(unsigned int i = 0; i < _num_inputs; i++) { - _stack_kernels[i].configure(input[i], axis_u, i, _num_inputs, output); + _stack_kernels[i].configure(compile_context, input[i], axis_u, i, _num_inputs, output); } } diff --git a/src/runtime/CL/functions/CLStridedSlice.cpp b/src/runtime/CL/functions/CLStridedSlice.cpp index e217906870..454759664c 100644 --- a/src/runtime/CL/functions/CLStridedSlice.cpp +++ b/src/runtime/CL/functions/CLStridedSlice.cpp @@ -32,9 +32,16 @@ namespace arm_compute void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); +} + +void CLStridedSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, + const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, + int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) { auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>(); - k->configure(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + k->configure(compile_context, input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLTableLookup.cpp b/src/runtime/CL/functions/CLTableLookup.cpp index 35f939f873..47e15d3c12 100644 --- a/src/runtime/CL/functions/CLTableLookup.cpp +++ b/src/runtime/CL/functions/CLTableLookup.cpp @@ -31,8 +31,13 @@ using namespace arm_compute; void CLTableLookup::configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, lut, output); +} + +void CLTableLookup::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLLut *lut, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique<CLTableLookupKernel>(); - k->configure(input, lut, output); + 
k->configure(compile_context, input, lut, output); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLThreshold.cpp b/src/runtime/CL/functions/CLThreshold.cpp index a655783498..57c92724fa 100644 --- a/src/runtime/CL/functions/CLThreshold.cpp +++ b/src/runtime/CL/functions/CLThreshold.cpp @@ -31,8 +31,14 @@ using namespace arm_compute; void CLThreshold::configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, threshold, false_value, true_value, type, upper); +} + +void CLThreshold::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, + uint8_t upper) { auto k = arm_compute::support::cpp14::make_unique<CLThresholdKernel>(); - k->configure(input, output, threshold, false_value, true_value, type, upper); + k->configure(compile_context, input, output, threshold, false_value, true_value, type, upper); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLTile.cpp b/src/runtime/CL/functions/CLTile.cpp index e45d88d867..178d7af95e 100644 --- a/src/runtime/CL/functions/CLTile.cpp +++ b/src/runtime/CL/functions/CLTile.cpp @@ -29,9 +29,14 @@ namespace arm_compute { void CLTile::configure(const ICLTensor *input, ICLTensor *output, const Multiples &multiples) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, multiples); +} + +void CLTile::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples) { auto k = arm_compute::support::cpp14::make_unique<CLTileKernel>(); - k->configure(input, output, multiples); + k->configure(compile_context, input, output, multiples); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp index aa912303db..f5121d06a5 100644 --- a/src/runtime/CL/functions/CLTranspose.cpp +++ b/src/runtime/CL/functions/CLTranspose.cpp @@ -31,9 +31,14 @@ using namespace arm_compute; void CLTranspose::configure(const ICLTensor *input, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output); +} + +void CLTranspose::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>(); - k->configure(input, output); + k->configure(compile_context, input, output); _kernel = std::move(k); } diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp index eb1dd8cd44..032fb993d0 100644 --- a/src/runtime/CL/functions/CLUnstack.cpp +++ b/src/runtime/CL/functions/CLUnstack.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited.
* * SPDX-License-Identifier: MIT * @@ -60,6 +60,11 @@ CLUnstack::CLUnstack() // NOLINT } void CLUnstack::configure(const ICLTensor *input, const std::vector<ICLTensor *> &output_vector, int axis) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output_vector, axis); +} + +void CLUnstack::configure(const CLCompileContext &compile_context, const ICLTensor *input, const std::vector<ICLTensor *> &output_vector, int axis) { std::vector<ITensorInfo *> outputs_vector_info(output_vector.size()); std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ICLTensor * t) @@ -83,7 +88,7 @@ void CLUnstack::configure(const ICLTensor *input, const std::vector<ICLTensor *> { // Adjusts start and end coordinates to take a 2D slice at a time slice_start.set(axis_u, slice); - _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u)); + _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u)); } } diff --git a/src/runtime/CL/functions/CLUpsampleLayer.cpp b/src/runtime/CL/functions/CLUpsampleLayer.cpp index 1dad3250a2..dd04686d60 100644 --- a/src/runtime/CL/functions/CLUpsampleLayer.cpp +++ b/src/runtime/CL/functions/CLUpsampleLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -43,11 +43,17 @@ Status CLUpsampleLayer::validate(const ITensorInfo *input, const ITensorInfo *ou void CLUpsampleLayer::configure(ICLTensor *input, ICLTensor *output, const Size2D &info, const InterpolationPolicy upsampling_policy) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, info, upsampling_policy); +} + +void CLUpsampleLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, + const Size2D &info, const InterpolationPolicy upsampling_policy) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); _output = output; - _upsample.configure(input, _output, info, upsampling_policy); + _upsample.configure(compile_context, input, _output, info, upsampling_policy); } void CLUpsampleLayer::run() diff --git a/src/runtime/CL/functions/CLWarpAffine.cpp b/src/runtime/CL/functions/CLWarpAffine.cpp index 08c22dba2b..ce2171b3d4 100644 --- a/src/runtime/CL/functions/CLWarpAffine.cpp +++ b/src/runtime/CL/functions/CLWarpAffine.cpp @@ -32,9 +32,15 @@ using namespace arm_compute; void CLWarpAffine::configure(ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, matrix, policy, border_mode, constant_border_value); +} + +void CLWarpAffine::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, + uint8_t constant_border_value) { auto k = arm_compute::support::cpp14::make_unique<CLWarpAffineKernel>(); - k->configure(input, output, matrix, policy); + k->configure(compile_context, input, output, matrix, policy); _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLWarpPerspective.cpp b/src/runtime/CL/functions/CLWarpPerspective.cpp
index b5bc4faba3..06c06616d0 100644 --- a/src/runtime/CL/functions/CLWarpPerspective.cpp +++ b/src/runtime/CL/functions/CLWarpPerspective.cpp @@ -32,9 +32,15 @@ using namespace arm_compute; void CLWarpPerspective::configure(ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, matrix, policy, border_mode, constant_border_value); +} + +void CLWarpPerspective::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, + uint8_t constant_border_value) { auto k = arm_compute::support::cpp14::make_unique<CLWarpPerspectiveKernel>(); - k->configure(input, output, matrix, policy); + k->configure(compile_context, input, output, matrix, policy); _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp index a5db977371..132c3ee926 100644 --- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -97,6 +97,13 @@ CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryMa void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, enable_fast_math); +} + +void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math) { // Get indices for the width and height const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); @@ -129,17 +136,18 @@ void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *we // Do not manage _input1 as it contains the weights // Configure input transform - _input_transform.configure(input, &_input0, winograd_info); + _input_transform.configure(compile_context, input, &_input0, winograd_info); // Configure filter transform - _filter_transform.configure(weights, &_input1, winograd_info); + _filter_transform.configure(compile_context, weights, &_input1, winograd_info); // Configure batched matrix multiply - _batched_mm.configure(&_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, GEMMLowpOutputStageInfo(), (input->info()->data_type() == DataType::F16))); + _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, + GEMMLowpOutputStageInfo(), + (input->info()->data_type() == DataType::F16))); // Configure output transform - _output_transform.configure(&_batched_mm_output, biases, output, winograd_info, act_info); + _output_transform.configure(compile_context, &_batched_mm_output, biases, output, winograd_info, act_info); // Allocate temporary tensors _input0.allocator()->allocate(); diff --git a/src/runtime/CL/functions/CLWinogradInputTransform.cpp b/src/runtime/CL/functions/CLWinogradInputTransform.cpp index 55eccf4765..ae400768fe 100644 --- a/src/runtime/CL/functions/CLWinogradInputTransform.cpp +++ b/src/runtime/CL/functions/CLWinogradInputTransform.cpp @@ -31,11 +31,16 @@ using namespace arm_compute; void CLWinogradInputTransform::configure(ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info) +{ + 
configure(CLKernelLibrary::get().get_compile_context(), input, output, winograd_info); +} + +void CLWinogradInputTransform::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info) { auto k = arm_compute::support::cpp14::make_unique<CLWinogradInputTransformKernel>(); - k->configure(input, output, winograd_info); + k->configure(compile_context, input, output, winograd_info); _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue()); + _border_handler.configure(compile_context, input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue()); } Status CLWinogradInputTransform::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info) diff --git a/src/runtime/CL/functions/CLYOLOLayer.cpp b/src/runtime/CL/functions/CLYOLOLayer.cpp index b2b84d9dc1..0c0c1065bc 100644 --- a/src/runtime/CL/functions/CLYOLOLayer.cpp +++ b/src/runtime/CL/functions/CLYOLOLayer.cpp @@ -30,9 +30,14 @@ using namespace arm_compute; void CLYOLOLayer::configure(ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, act_info, num_classes); +} + +void CLYOLOLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes) { auto k = arm_compute::support::cpp14::make_unique<CLYOLOLayerKernel>(); - k->configure(input, output, act_info, num_classes); + k->configure(compile_context, input, output, act_info, num_classes); _kernel = std::move(k); } -- cgit v1.2.1
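
All of the changes above follow one two-step pattern: the pre-existing configure() overload is kept for source compatibility and simply forwards to a new overload that takes a CLCompileContext as its first argument, using the library's default context, and the new overload then threads the context down to the underlying kernel(s) so OpenCL programs are built against an explicit compile context. A minimal sketch of the pattern, using the illustrative names CLExampleFunction and CLExampleKernel (these are not real library classes):

    void CLExampleFunction::configure(const ICLTensor *input, ICLTensor *output)
    {
        // Legacy entry point: forward to the context-aware overload with the
        // library's global default compile context
        configure(CLKernelLibrary::get().get_compile_context(), input, output);
    }

    void CLExampleFunction::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
    {
        // Context-aware entry point: pass the compile context down to the kernel
        auto k = arm_compute::support::cpp14::make_unique<CLExampleKernel>();
        k->configure(compile_context, input, output);
        _kernel = std::move(k);
    }

Multi-kernel functions (e.g. CLRNNLayer, CLSobel5x5, CLReductionOperation) apply the same forwarding once at the top and then pass compile_context to every internal configure() call, including border handlers.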