From 68dd25fbe6e4d3c3513fa5993863419769aa08fc Mon Sep 17 00:00:00 2001
From: Sang-Hoon Park <sang-hoon.park@arm.com>
Date: Mon, 19 Oct 2020 16:00:11 +0100
Subject: COMPMID-3637: Move utility headers from arm_compute to src

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: If9d6fa8c900b68c4b6fd373f2fc1f9abb83ea917
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4145
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Sang-Hoon Park <sang-hoon.park@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 src/runtime/CL/CLHelpers.cpp                       |  3 +-
 src/runtime/CL/CLMemory.cpp                        |  4 +-
 src/runtime/CL/CLRuntimeContext.cpp                |  2 +
 src/runtime/CL/CLTensorAllocator.cpp               |  4 +-
 src/runtime/CL/functions/CLArgMinMaxLayer.cpp      | 11 ++--
 src/runtime/CL/functions/CLConcatenateLayer.cpp    |  1 +
 .../functions/CLConvertFullyConnectedWeights.cpp   |  2 +
 src/runtime/CL/functions/CLConvolutionLayer.cpp    |  2 +
 src/runtime/CL/functions/CLCropResize.cpp          |  4 ++
 src/runtime/CL/functions/CLDeconvolutionLayer.cpp  |  2 +
 .../CL/functions/CLDirectConvolutionLayer.cpp      |  2 +-
 .../CL/functions/CLDirectDeconvolutionLayer.cpp    |  1 +
 src/runtime/CL/functions/CLFFT1D.cpp               |  2 +-
 src/runtime/CL/functions/CLFFTConvolutionLayer.cpp |  5 +-
 src/runtime/CL/functions/CLFill.cpp                |  2 +
 src/runtime/CL/functions/CLFullyConnectedLayer.cpp |  2 +-
 src/runtime/CL/functions/CLGEMM.cpp                | 15 +++--
 .../CL/functions/CLGEMMConvolutionLayer.cpp        |  3 +-
 .../CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp  |  7 ++-
 .../CL/functions/CLGenerateProposalsLayer.cpp      |  1 +
 .../CL/functions/CLInstanceNormalizationLayer.cpp  |  2 +
 src/runtime/CL/functions/CLLSTMLayerQuantized.cpp  |  1 +
 src/runtime/CL/functions/CLPriorBoxLayer.cpp       |  2 +
 src/runtime/CL/functions/CLQLSTMLayer.cpp          |  1 +
 src/runtime/CL/functions/CLReduceMean.cpp          |  3 +-
 src/runtime/CL/functions/CLReductionOperation.cpp  |  8 ++-
 src/runtime/CL/functions/CLRemap.cpp               |  2 +-
 src/runtime/CL/functions/CLSelect.cpp              |  2 +
 src/runtime/CL/functions/CLSoftmaxLayer.cpp        |  7 ++-
 src/runtime/CL/functions/CLSplit.cpp               |  1 +
 .../CL/functions/CLWinogradConvolutionLayer.cpp    |  2 +-
 src/runtime/CL/gemm/CLGEMMKernelSelection.h        | 65 ++++++++++++++++++++++
 .../CL/gemm/CLGEMMKernelSelectionBifrost.cpp       |  4 +-
 src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h | 55 ++++++++++++++++++
 .../CL/gemm/CLGEMMKernelSelectionMidgard.cpp       |  4 +-
 src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h | 53 ++++++++++++++++++
 .../CL/gemm/CLGEMMKernelSelectionValhall.cpp       |  4 +-
 src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h | 53 ++++++++++++++++++
 src/runtime/CL/tuners/BifrostTuner.cpp             |  2 +-
 src/runtime/CL/tuners/MidgardTuner.cpp             |  2 +-
 src/runtime/CPP/CPPScheduler.cpp                   |  3 +-
 .../CPP/functions/CPPDetectionOutputLayer.cpp      |  1 +
 .../CPP/functions/CPPDetectionPostProcessLayer.cpp |  1 +
 src/runtime/CPUUtils.cpp                           |  9 ++-
 src/runtime/CPUUtils.h                             | 51 +++++++++++++++++
 src/runtime/DeviceProperties.cpp                   |  6 +-
 src/runtime/GLES_COMPUTE/GCMemory.cpp              |  4 +-
 .../GLES_COMPUTE/functions/GCConcatenateLayer.cpp  |  2 +
 src/runtime/IScheduler.cpp                         | 10 ++--
 src/runtime/NEON/INESimpleFunctionNoBorder.cpp     |  6 +-
 src/runtime/NEON/functions/NEArgMinMaxLayer.cpp    |  4 +-
 .../NEON/functions/NEBatchNormalizationLayer.cpp   |  4 +-
 src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp |  4 +-
 src/runtime/NEON/functions/NEConcatenateLayer.cpp  |  1 +
 src/runtime/NEON/functions/NECropResize.cpp        |  2 +
 .../NEON/functions/NEDeconvolutionLayer.cpp        |  1 +
 src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp |  4 +-
 src/runtime/NEON/functions/NEFFT1D.cpp             |  4 +-
 .../NEON/functions/NEFFTConvolutionLayer.cpp       |  7 ++-
 .../NEON/functions/NEFullyConnectedLayer.cpp       |  2 +
 src/runtime/NEON/functions/NEGEMM.cpp              |  3 +-
 .../NEON/functions/NEGEMMAssemblyDispatch.cpp      | 10 ++--
 .../functions/NEGEMMLowpMatrixMultiplyCore.cpp     |  1 +
 .../NEON/functions/NEGenerateProposalsLayer.cpp    |  1 +
 .../NEON/functions/NELSTMLayerQuantized.cpp        |  3 +-
 src/runtime/NEON/functions/NEPadLayer.cpp          |  1 +
 src/runtime/NEON/functions/NEPriorBoxLayer.cpp     |  4 +-
 src/runtime/NEON/functions/NEQLSTMLayer.cpp        |  1 +
 src/runtime/NEON/functions/NEReduceMean.cpp        |  3 +-
 .../NEON/functions/NEReductionOperation.cpp        |  1 +
 src/runtime/NEON/functions/NEScale.cpp             |  4 +-
 .../NEON/functions/NESimpleAssemblyFunction.cpp    |  4 +-
 .../NEON/functions/NESimpleAssemblyFunction.h      | 56 +++++++++++++++++++
 src/runtime/NEON/functions/NESoftmaxLayer.cpp      |  7 ++-
 .../NEON/functions/NEWinogradConvolutionLayer.cpp  |  4 +-
 .../NEDepthwiseConvolutionAssemblyDispatch.cpp     | 11 ++--
 src/runtime/OMP/OMPScheduler.cpp                   |  2 +-
 src/runtime/SchedulerUtils.cpp                     |  4 ++
 src/runtime/SchedulerUtils.h                       | 45 +++++++++++++++
 src/runtime/Utils.cpp                              |  7 ++-
 src/runtime/Utils.h                                | 60 ++++++++++++++++++++
 81 files changed, 618 insertions(+), 88 deletions(-)
 create mode 100644 src/runtime/CL/gemm/CLGEMMKernelSelection.h
 create mode 100644 src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h
 create mode 100644 src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h
 create mode 100644 src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h
 create mode 100644 src/runtime/CPUUtils.h
 create mode 100644 src/runtime/NEON/functions/NESimpleAssemblyFunction.h
 create mode 100644 src/runtime/SchedulerUtils.h
 create mode 100644 src/runtime/Utils.h

(limited to 'src/runtime')

diff --git a/src/runtime/CL/CLHelpers.cpp b/src/runtime/CL/CLHelpers.cpp
index adfdc3c917..5f1842f76d 100644
--- a/src/runtime/CL/CLHelpers.cpp
+++ b/src/runtime/CL/CLHelpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/Error.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLRuntimeContext.h"
 
 namespace
diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp
index efbc68f50e..a1743c56e6 100644
--- a/src/runtime/CL/CLMemory.cpp
+++ b/src/runtime/CL/CLMemory.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CL/CLMemory.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/utils/misc/Cast.h"
+#include "support/Cast.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/CL/CLRuntimeContext.cpp b/src/runtime/CL/CLRuntimeContext.cpp
index 2fc7f93adf..571e30931c 100644
--- a/src/runtime/CL/CLRuntimeContext.cpp
+++ b/src/runtime/CL/CLRuntimeContext.cpp
@@ -26,6 +26,8 @@
 #include "arm_compute/runtime/CL/CLHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
+#include "support/MemorySupport.h"
+
 namespace arm_compute
 {
 CLRuntimeContext::CLRuntimeContext()
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index 90d77883f6..f37fc779fe 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,6 +28,8 @@
 #include "arm_compute/runtime/CL/CLRuntimeContext.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
+#include "support/MemorySupport.h"
+
 namespace arm_compute
 {
 const cl::Buffer CLTensorAllocator::_empty_buffer = cl::Buffer();
diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
index ad6e7ba97b..57c4f685f6 100644
--- a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
+++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
@@ -24,13 +24,14 @@
 
 #include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"
 
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/Utils.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/runtime/Utils.h"
 
 namespace arm_compute
 {
@@ -47,7 +48,7 @@ Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITen
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid reduction operation");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions), "Reduction axis greater than max number of dimensions");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
-    const unsigned int num_of_stages = calculate_number_of_stages_only_x_axis(input->dimension(0), axis);
+    const unsigned int num_of_stages = utils::calculate_number_of_stages_only_x_axis(input->dimension(0), axis);
 
     DataType   output_data_type = DataType::S32;
     TensorInfo not_reshaped_output;
@@ -115,7 +116,7 @@ void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *ou
 void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    _num_of_stages  = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
+    _num_of_stages  = utils::calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
     _reduction_axis = axis;
 
     const TensorShape output_shape     = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
@@ -172,4 +173,4 @@ void CLArgMinMaxLayer::run()
     }
     _reshape.run();
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp
index 4214813446..2eb310b893 100644
--- a/src/runtime/CL/functions/CLConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp
@@ -36,6 +36,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
+#include "src/core/helpers/AutoConfiguration.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
index 4c787673b5..b291ae5b88 100644
--- a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
+++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
@@ -23,6 +23,8 @@
  */
 #include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h"
 
+#include "support/MemorySupport.h"
+
 namespace arm_compute
 {
 void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 630352e4e6..85355f0f17 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -30,6 +30,8 @@
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
+#include "support/MemorySupport.h"
+
 #include <cmath>
 #include <memory>
 #include <tuple>
diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp
index 529f7bfb3e..6167e9de0a 100644
--- a/src/runtime/CL/functions/CLCropResize.cpp
+++ b/src/runtime/CL/functions/CLCropResize.cpp
@@ -25,6 +25,10 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "support/MemorySupport.h"
 
 #include <cstddef>
 
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index cd55336d9a..e6717b6d01 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -29,6 +29,8 @@
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
+#include "support/MemorySupport.h"
+
 #include <cmath>
 #include <memory>
 #include <tuple>
diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
index c1055dda36..07e7a18941 100644
--- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -43,7 +43,7 @@ void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weig
 }
 
 void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
-                                         const PadStrideInfo &conv_info,
+                                         const PadStrideInfo       &conv_info,
                                          const ActivationLayerInfo &act_info)
 {
     // Set GPU target
diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
index 3515c25d82..0ffafa0221 100644
--- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 #include <memory>
 #include <tuple>
diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp
index 7d15d33ab5..1269cba90d 100644
--- a/src/runtime/CL/functions/CLFFT1D.cpp
+++ b/src/runtime/CL/functions/CLFFT1D.cpp
@@ -25,8 +25,8 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/helpers/fft.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/utils/helpers/fft.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
index 1def674bb6..4d0eab81ee 100644
--- a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
@@ -26,10 +26,13 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/helpers/fft.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CPP/CPPScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/utils/helpers/fft.h"
+
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/CL/functions/CLFill.cpp b/src/runtime/CL/functions/CLFill.cpp
index 6c0f1786f0..a89383ec31 100644
--- a/src/runtime/CL/functions/CLFill.cpp
+++ b/src/runtime/CL/functions/CLFill.cpp
@@ -26,6 +26,8 @@
 #include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
 #include "arm_compute/core/Types.h"
 
+#include "support/MemorySupport.h"
+
 #include <utility>
 
 namespace arm_compute
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 4f365b6a61..75e87c382b 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -25,10 +25,10 @@
 
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/Cast.h"
 #include "support/MemorySupport.h"
 
 #include <algorithm>
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index d56b341abf..ccae6713a6 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -23,10 +23,7 @@
  */
 #include "arm_compute/runtime/CL/functions/CLGEMM.h"
 
-#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h"
-#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/Helpers.h"
@@ -35,12 +32,18 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/helpers/float_ops.h"
-#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h"
 #include "arm_compute/runtime/ITensorAllocator.h"
+#include "src/core/CL/ICLGEMMKernelConfiguration.h"
+#include "src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h"
+#include "src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/utils/helpers/float_ops.h"
+#include "src/runtime/CL/gemm/CLGEMMKernelSelection.h"
+#include "support/Cast.h"
+
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index ee90b39c2b..e871b39805 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -27,10 +27,11 @@
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "support/Cast.h"
 
 #include <cmath>
 #include <memory>
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 30dce5b8fe..7a8de6c1f5 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -24,8 +24,6 @@
 #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h"
-#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/KernelDescriptors.h"
@@ -35,7 +33,10 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h"
+#include "src/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h"
+#include "src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/runtime/CL/gemm/CLGEMMKernelSelection.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
index 45dc402449..5291de074a 100644
--- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
+++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
index fce1fe43a2..4a60ee9d08 100644
--- a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
@@ -26,6 +26,8 @@
 #include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h"
 #include "arm_compute/core/Types.h"
 
+#include "support/MemorySupport.h"
+
 namespace arm_compute
 {
 CLInstanceNormalizationLayer::CLInstanceNormalizationLayer()
diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
index e30b1dbb86..76a531b1c9 100644
--- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 #include <memory>
 
diff --git a/src/runtime/CL/functions/CLPriorBoxLayer.cpp b/src/runtime/CL/functions/CLPriorBoxLayer.cpp
index 1907c7cc08..fefbff639d 100644
--- a/src/runtime/CL/functions/CLPriorBoxLayer.cpp
+++ b/src/runtime/CL/functions/CLPriorBoxLayer.cpp
@@ -31,6 +31,8 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
+#include "support/MemorySupport.h"
+
 using namespace arm_compute;
 
 CLPriorBoxLayer::CLPriorBoxLayer()
diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp
index 15a54c7928..c493471667 100644
--- a/src/runtime/CL/functions/CLQLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp
@@ -30,6 +30,7 @@
 #include "arm_compute/core/utils/misc/InfoHelpers.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
index 0e2ede7167..4ea7f7642f 100644
--- a/src/runtime/CL/functions/CLReduceMean.cpp
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -23,12 +23,13 @@
  */
 #include "arm_compute/runtime/CL/functions/CLReduceMean.h"
 
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index 54e91fb8d8..208371c45d 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -30,7 +30,9 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/Utils.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/runtime/Utils.h"
+
 #include "support/MemorySupport.h"
 
 namespace arm_compute
@@ -47,7 +49,7 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
 
-    const unsigned int num_of_stages       = calculate_number_of_stages_only_x_axis(input->dimension(0), axis);
+    const unsigned int num_of_stages       = utils::calculate_number_of_stages_only_x_axis(input->dimension(0), axis);
     const bool         is_serial           = needs_serialized_reduction(op, input->data_type(), axis);
     const bool         is_reshape_required = !keep_dims;
 
@@ -194,7 +196,7 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign
 void CLReductionOperation::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    _num_of_stages       = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
+    _num_of_stages       = utils::calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
     _reduction_axis      = axis;
     _is_serial           = needs_serialized_reduction(op, input->info()->data_type(), axis);
     _is_reshape_required = !keep_dims;
diff --git a/src/runtime/CL/functions/CLRemap.cpp b/src/runtime/CL/functions/CLRemap.cpp
index 60b72c5f87..1e3d614402 100644
--- a/src/runtime/CL/functions/CLRemap.cpp
+++ b/src/runtime/CL/functions/CLRemap.cpp
@@ -42,7 +42,7 @@ void CLRemap::configure(ICLTensor *input, const ICLTensor *map_x, const ICLTenso
 
 void CLRemap::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy,
                         BorderMode border_mode,
-                        uint8_t constant_border_value)
+                        uint8_t    constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp
index c7d7df75d2..ef8010847b 100644
--- a/src/runtime/CL/functions/CLSelect.cpp
+++ b/src/runtime/CL/functions/CLSelect.cpp
@@ -27,6 +27,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
+#include "support/MemorySupport.h"
+
 using namespace arm_compute;
 
 namespace arm_compute
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index 720f9111a5..759c8706a1 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -31,6 +31,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/helpers/SoftmaxHelpers.h"
 
 namespace arm_compute
 {
@@ -63,7 +64,7 @@ void CLSoftmaxLayerGeneric<IS_LOG>::configure(const CLCompileContext &compile_co
     {
         _memory_group.manage(&_input_permuted);
         _memory_group.manage(&_output_permuted);
-        _permute_input.configure(compile_context, input, &_input_permuted, get_permutation_vector_from_softmax_axis(actual_axis));
+        _permute_input.configure(compile_context, input, &_input_permuted, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
         tmp_output = &_output_permuted;
     }
 
@@ -99,7 +100,7 @@ void CLSoftmaxLayerGeneric<IS_LOG>::configure(const CLCompileContext &compile_co
     _sum.allocator()->allocate();
     if(_needs_permute)
     {
-        _permute_output.configure(compile_context, &_output_permuted, output, get_permutation_vector_from_softmax_axis(actual_axis));
+        _permute_output.configure(compile_context, &_output_permuted, output, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
         _input_permuted.allocator()->allocate();
         _output_permuted.allocator()->allocate();
     }
@@ -117,7 +118,7 @@ Status CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const I
     const bool   needs_permute = actual_axis != 0;
     if(needs_permute)
     {
-        const PermutationVector permutation_vector = get_permutation_vector_from_softmax_axis(actual_axis);
+        const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
         const TensorShape       permuted_shape     = misc::shape_calculator::compute_permutation_output_shape(*input, permutation_vector);
         TensorInfo              input_permuted(input->clone()->set_tensor_shape(permuted_shape));
         ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(input, &input_permuted, permutation_vector));
diff --git a/src/runtime/CL/functions/CLSplit.cpp b/src/runtime/CL/functions/CLSplit.cpp
index db0b14b9a2..0b27371e3f 100644
--- a/src/runtime/CL/functions/CLSplit.cpp
+++ b/src/runtime/CL/functions/CLSplit.cpp
@@ -30,6 +30,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index 09a35a6f27..7ad017f918 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -102,7 +102,7 @@ void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *we
 }
 
 void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
-                                           const PadStrideInfo &conv_info,
+                                           const PadStrideInfo       &conv_info,
                                            const ActivationLayerInfo &act_info, bool enable_fast_math)
 {
     // Get indices for the width and height
diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelection.h b/src/runtime/CL/gemm/CLGEMMKernelSelection.h
new file mode 100644
index 0000000000..f6fad7e4ff
--- /dev/null
+++ b/src/runtime/CL/gemm/CLGEMMKernelSelection.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CLGEMMKERNELSELECTION_H
+#define SRC_CLGEMMKERNELSELECTION_H
+
+#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h"
+#include "src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h"
+#include "src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h"
+#include "src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h"
+
+#include "support/MemorySupport.h"
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** Factory class creating the ICLGEMMKernelSelection implementation that matches a GPU architecture */
+class CLGEMMKernelSelectionFactory final
+{
+public:
+    /** Static method to create the GEMM kernel selection object matching the architecture of the GPU target
+     *
+     * @param[in] gpu GPU target; its architecture is obtained with get_arch_from_target()
+     *
+     * @return Midgard, Bifrost or Valhall kernel selection object; any other architecture is reported through ARM_COMPUTE_ERROR
+     */
+    static std::unique_ptr<ICLGEMMKernelSelection> create(GPUTarget gpu)
+    {
+        switch(get_arch_from_target(gpu))
+        {
+            case GPUTarget::MIDGARD:
+                return support::cpp14::make_unique<CLGEMMKernelSelectionMidgard>(gpu);
+            case GPUTarget::BIFROST:
+                return support::cpp14::make_unique<CLGEMMKernelSelectionBifrost>(gpu);
+            case GPUTarget::VALHALL:
+                return support::cpp14::make_unique<CLGEMMKernelSelectionValhall>(gpu);
+            default:
+                ARM_COMPUTE_ERROR("Not supported GPU target");
+        }
+    }
+};
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /* SRC_CLGEMMKERNELSELECTION_H */
diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp b/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp
index b1dd690ca5..73b90568f5 100644
--- a/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp
+++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp
@@ -21,11 +21,11 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h"
+#include "src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
+#include "src/core/CL/gemm/CLGEMMHelpers.h"
 
 #include <map>
 #include <utility>
diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h b/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h
new file mode 100644
index 0000000000..a495b48301
--- /dev/null
+++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CLGEMMKERNELSELECTIONBIFROST_H
+#define SRC_CLGEMMKERNELSELECTIONBIFROST_H
+
+#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h"
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** Bifrost based OpenCL GEMMKernel selection, returned by CLGEMMKernelSelectionFactory::create() for GPUTarget::BIFROST */
+class CLGEMMKernelSelectionBifrost final : public ICLGEMMKernelSelection
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    CLGEMMKernelSelectionBifrost(GPUTarget gpu);
+
+    // Inherited overridden method (virtual interface reached through ICLGEMMKernelSelection)
+    CLGEMMKernelType select_kernel(const CLGEMMKernelSelectionParams &params) override;
+
+private:
+    CLGEMMKernelType g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+    CLGEMMKernelType g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+    CLGEMMKernelType default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+    CLGEMMKernelType default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+    CLGEMMKernelType default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+};
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /* SRC_CLGEMMKERNELSELECTIONBIFROST_H */
diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp b/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp
index 324c2f7dca..d172a827b5 100644
--- a/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp
+++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp
@@ -21,12 +21,12 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h"
+#include "src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
 #include "arm_compute/core/GPUTarget.h"
+#include "src/core/CL/gemm/CLGEMMHelpers.h"
 
 #include <map>
 #include <utility>
diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h b/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h
new file mode 100644
index 0000000000..3f6003f7dc
--- /dev/null
+++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CLGEMMKERNELSELECTIONMIDGARD_H
+#define SRC_CLGEMMKERNELSELECTIONMIDGARD_H
+
+#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h"
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** Midgard based OpenCL GEMMKernel selection, returned by CLGEMMKernelSelectionFactory::create() for GPUTarget::MIDGARD */
+class CLGEMMKernelSelectionMidgard final : public ICLGEMMKernelSelection
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    CLGEMMKernelSelectionMidgard(GPUTarget gpu);
+
+    // Inherited overridden method (virtual interface reached through ICLGEMMKernelSelection)
+    CLGEMMKernelType select_kernel(const CLGEMMKernelSelectionParams &params) override;
+
+private:
+    CLGEMMKernelType default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+    CLGEMMKernelType default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+    CLGEMMKernelType default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+};
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /* SRC_CLGEMMKERNELSELECTIONMIDGARD_H */
diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp b/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp
index c50c7ae76b..acae0e7565 100644
--- a/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp
+++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp
@@ -21,11 +21,11 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h"
+#include "src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
+#include "src/core/CL/gemm/CLGEMMHelpers.h"
 
 #include <map>
 #include <utility>
diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h b/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h
new file mode 100644
index 0000000000..cbea9ea548
--- /dev/null
+++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CLGEMMKERNELSELECTIONVALHALL_H
+#define SRC_CLGEMMKERNELSELECTIONVALHALL_H
+
+#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h"
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** Valhall based OpenCL GEMMKernel selection, returned by CLGEMMKernelSelectionFactory::create() for GPUTarget::VALHALL */
+class CLGEMMKernelSelectionValhall final : public ICLGEMMKernelSelection
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    CLGEMMKernelSelectionValhall(GPUTarget gpu);
+
+    // Inherited overridden method (virtual interface reached through ICLGEMMKernelSelection)
+    CLGEMMKernelType select_kernel(const CLGEMMKernelSelectionParams &params) override;
+
+private:
+    CLGEMMKernelType default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+    CLGEMMKernelType default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+    CLGEMMKernelType default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+};
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /* SRC_CLGEMMKERNELSELECTIONVALHALL_H */
diff --git a/src/runtime/CL/tuners/BifrostTuner.cpp b/src/runtime/CL/tuners/BifrostTuner.cpp
index 52644bf192..a6474c9835 100644
--- a/src/runtime/CL/tuners/BifrostTuner.cpp
+++ b/src/runtime/CL/tuners/BifrostTuner.cpp
@@ -25,7 +25,7 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernels.h"
-#include "arm_compute/core/utils/misc/Cast.h"
+#include "support/Cast.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/CL/tuners/MidgardTuner.cpp b/src/runtime/CL/tuners/MidgardTuner.cpp
index e49e15508b..58b0d579d2 100644
--- a/src/runtime/CL/tuners/MidgardTuner.cpp
+++ b/src/runtime/CL/tuners/MidgardTuner.cpp
@@ -25,7 +25,7 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernels.h"
-#include "arm_compute/core/utils/misc/Cast.h"
+#include "support/Cast.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index f017006de7..e6b0ec20b8 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -27,7 +27,8 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/runtime/CPUUtils.h"
+#include "src/runtime/CPUUtils.h"
+#include "support/MemorySupport.h"
 #include "support/Mutex.h"
 
 #include <atomic>
diff --git a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
index 9d62733384..fdb4c9f0f6 100644
--- a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 #include <list>
 
diff --git a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
index 3507a3ac45..31f1fafd69 100644
--- a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 #include <cstddef>
 #include <ios>
diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp
index 4d6caaee01..a7dd464540 100644
--- a/src/runtime/CPUUtils.cpp
+++ b/src/runtime/CPUUtils.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/CPUUtils.h"
+#include "src/runtime/CPUUtils.h"
 
 #include "arm_compute/core/CPP/CPPTypes.h"
 #include "arm_compute/core/Error.h"
@@ -352,6 +352,10 @@ int get_max_cpus()
 
 namespace arm_compute
 {
+namespace utils
+{
+namespace cpu
+{
 void get_cpu_configuration(CPUInfo &cpuinfo)
 {
 #if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
@@ -460,5 +464,6 @@ unsigned int get_threads_hint()
 
     return num_threads_hint;
 }
-
+} // namespace cpu
+} // namespace utils
 } // namespace arm_compute
diff --git a/src/runtime/CPUUtils.h b/src/runtime/CPUUtils.h
new file mode 100644
index 0000000000..452d3d58ca
--- /dev/null
+++ b/src/runtime/CPUUtils.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_RUNTIME_CPU_UTILS_H
+#define ARM_COMPUTE_RUNTIME_CPU_UTILS_H
+
+namespace arm_compute
+{
+class CPUInfo;
+
+namespace utils
+{
+namespace cpu
+{
+/** This function will try to detect the CPU configuration on the system and will fill
+ *  the cpuinfo object accordingly to reflect this.
+ *
+ * @param[out] cpuinfo @ref CPUInfo to be used to hold the system's cpu configuration.
+ */
+void get_cpu_configuration(CPUInfo &cpuinfo);
+/** Some systems have both big and small cores; this function computes the minimum number of cores
+ *  that are exactly the same on the system. To maximize performance the library attempts to process
+ *  workloads concurrently using as many threads as big cores are available on the system.
+ *
+ * @return The minimum number of common cores.
+ */
+unsigned int get_threads_hint();
+} // namespace cpu
+} // namespace utils
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_RUNTIME_CPU_UTILS_H */
diff --git a/src/runtime/DeviceProperties.cpp b/src/runtime/DeviceProperties.cpp
index 5d7ae020d7..ec9f4a16ed 100644
--- a/src/runtime/DeviceProperties.cpp
+++ b/src/runtime/DeviceProperties.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,12 +23,12 @@
  */
 #include "arm_compute/runtime/DeviceProperties.h"
 
-#include "arm_compute/runtime/CPUUtils.h"
+#include "src/runtime/CPUUtils.h"
 
 namespace arm_compute
 {
 DeviceProperties::DeviceProperties()
 {
-    get_cpu_configuration(cpu_info);
+    utils::cpu::get_cpu_configuration(cpu_info);
 }
 } // namespace arm_compute
diff --git a/src/runtime/GLES_COMPUTE/GCMemory.cpp b/src/runtime/GLES_COMPUTE/GCMemory.cpp
index 998f8a5cc4..4d74555f4e 100644
--- a/src/runtime/GLES_COMPUTE/GCMemory.cpp
+++ b/src/runtime/GLES_COMPUTE/GCMemory.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/GLES_COMPUTE/GCMemory.h"
 
-#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h"
+#include "support/Cast.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp
index 9e23974b8d..807412eb17 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp
@@ -29,6 +29,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp
index 53df3699b0..43df3d5e23 100644
--- a/src/runtime/IScheduler.cpp
+++ b/src/runtime/IScheduler.cpp
@@ -26,17 +26,17 @@
 #include "arm_compute/core/CPP/ICPPKernel.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/runtime/CPUUtils.h"
-#include "arm_compute/runtime/SchedulerUtils.h"
+#include "src/runtime/CPUUtils.h"
+#include "src/runtime/SchedulerUtils.h"
 
 namespace arm_compute
 {
 IScheduler::IScheduler()
     : _cpu_info()
 {
-    get_cpu_configuration(_cpu_info);
+    utils::cpu::get_cpu_configuration(_cpu_info);
     // Work out the best possible number of execution threads
-    _num_threads_hint = get_threads_hint();
+    _num_threads_hint = utils::cpu::get_threads_hint();
 }
 
 CPUInfo &IScheduler::cpu_info()
@@ -74,7 +74,7 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, ITensor
 
         //in c++17 this can be swapped for   auto [ m_threads, n_threads ] = split_2d(...
         unsigned m_threads, n_threads;
-        std::tie(m_threads, n_threads) = split_2d(this->num_threads(), m, n);
+        std::tie(m_threads, n_threads) = scheduler_utils::split_2d(this->num_threads(), m, n);
 
         std::vector<IScheduler::Workload> workloads;
         for(unsigned int ni = 0; ni != n_threads; ++ni)
diff --git a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
index 82316c49c6..f2181e0a74 100644
--- a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
+++ b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
 
 #include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/Utils.h"
+#include "src/runtime/Utils.h"
 
 namespace arm_compute
 {
@@ -36,6 +36,6 @@ INESimpleFunctionNoBorder::INESimpleFunctionNoBorder(IRuntimeContext *ctx)
 
 void INESimpleFunctionNoBorder::run()
 {
-    schedule_kernel_on_ctx(_ctx, _kernel.get(), Window::DimY);
+    utils::schedule_kernel_on_ctx(_ctx, _kernel.get(), Window::DimY);
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
index 0664d3c9d5..70bbba62ad 100644
--- a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
+++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 
+#include "support/MemorySupport.h"
+
 namespace arm_compute
 {
 NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
index 5a593e9c74..eab40ac5be 100644
--- a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,8 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
+#include "support/MemorySupport.h"
+
 using namespace arm_compute;
 
 NEBatchNormalizationLayer::NEBatchNormalizationLayer()
diff --git a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
index c06a8aa0e0..2705cffe68 100644
--- a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
+++ b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 
+#include "support/MemorySupport.h"
+
 namespace arm_compute
 {
 void NEBatchToSpaceLayer::configure(const ITensor *input, const ITensor *block_shape, ITensor *output)
diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
index 8df4f4cb62..72bd9e6b19 100644
--- a/src/runtime/NEON/functions/NEConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
@@ -35,6 +35,7 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
+#include "src/core/helpers/AutoConfiguration.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NECropResize.cpp b/src/runtime/NEON/functions/NECropResize.cpp
index f6ed2ec250..f8f99169aa 100644
--- a/src/runtime/NEON/functions/NECropResize.cpp
+++ b/src/runtime/NEON/functions/NECropResize.cpp
@@ -25,6 +25,8 @@
 
 #include "arm_compute/runtime/NEON/functions/NECropResize.h"
 
+#include "support/MemorySupport.h"
+
 #include <cstddef>
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index dff3070239..cb9ab168a7 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 using namespace arm_compute::misc::shape_calculator;
 
diff --git a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
index e363f89482..0aaa37ec92 100644
--- a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 
+#include "support/MemorySupport.h"
+
 namespace arm_compute
 {
 void NEDepthToSpaceLayer::configure(const ITensor *input, ITensor *output, int32_t block_shape)
diff --git a/src/runtime/NEON/functions/NEFFT1D.cpp b/src/runtime/NEON/functions/NEFFT1D.cpp
index 744a91521f..2c53b185df 100644
--- a/src/runtime/NEON/functions/NEFFT1D.cpp
+++ b/src/runtime/NEON/functions/NEFFT1D.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,8 +25,8 @@
 
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/helpers/fft.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/utils/helpers/fft.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
index cd68788145..a46fc9f45f 100644
--- a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,8 +26,11 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/helpers/fft.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/utils/helpers/fft.h"
+
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 4dcf41e360..d956d16f4d 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -30,6 +30,8 @@
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
+#include "support/MemorySupport.h"
+
 #include <algorithm>
 #include <cmath>
 
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 3b8ca44ed7..4166cff97a 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEGEMM.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -34,6 +33,8 @@
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 #include <cmath>
 
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index ad349cb635..5b0848398d 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -23,13 +23,13 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
 
-#include "src/core/NEON/kernels/assembly/arm_gemm.hpp"
-
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h"
-
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
 #include "src/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h"
+#include "src/core/NEON/kernels/assembly/arm_gemm.hpp"
+
+#include "support/MemorySupport.h"
 
 #include <arm_neon.h>
 
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 83db146a8a..36357dde41 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -33,6 +33,7 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/helpers/AutoConfiguration.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
index 3d5377892a..13210a06cd 100644
--- a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
+++ b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
index 11989d3225..7610d15787 100644
--- a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 #include <cmath>
 #include <memory>
diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp
index 21c349ba95..03c597a3bf 100644
--- a/src/runtime/NEON/functions/NEPadLayer.cpp
+++ b/src/runtime/NEON/functions/NEPadLayer.cpp
@@ -27,6 +27,7 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
index fda130bf69..bcf6bef9c7 100644
--- a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
+++ b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,6 +31,8 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
+#include "support/MemorySupport.h"
+
 namespace arm_compute
 {
 void NEPriorBoxLayer::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info)
diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
index 5a6b51337a..95f20ae1a9 100644
--- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
@@ -30,6 +30,7 @@
 #include "arm_compute/core/utils/misc/InfoHelpers.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
index 021f7b530a..c3c5529c09 100644
--- a/src/runtime/NEON/functions/NEReduceMean.cpp
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -23,11 +23,12 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index 91176bfa45..4938a56b3f 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index 2278f07a1c..bbf8343c2b 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -30,12 +30,14 @@
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Rounding.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
 
 #include "src/core/utils/ScaleUtils.h"
 
+#include "support/MemorySupport.h"
+#include "support/Rounding.h"
+
 #include <cmath>
 #include <cstddef>
 #include <utility>
diff --git a/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp b/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp
index b0cafae520..d165b2235c 100644
--- a/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp
+++ b/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h"
+#include "src/runtime/NEON/functions/NESimpleAssemblyFunction.h"
 
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
diff --git a/src/runtime/NEON/functions/NESimpleAssemblyFunction.h b/src/runtime/NEON/functions/NESimpleAssemblyFunction.h
new file mode 100644
index 0000000000..e9be54d35f
--- /dev/null
+++ b/src/runtime/NEON/functions/NESimpleAssemblyFunction.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NESIMPLEASSEMBLYFUNCTION_H
+#define ARM_COMPUTE_NESIMPLEASSEMBLYFUNCTION_H
+
+#include "arm_compute/runtime/IFunction.h"
+#include "src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Basic interface for functions which have a single NEON GEMM wrapper kernel to run. */
+class NESimpleAssemblyFunction : public IFunction
+{
+public:
+    /** Constructor */
+    NESimpleAssemblyFunction();
+
+    /** Configure the function with the kernel to run.
+     *
+     * @param[in] kernel GEMM wrapper kernel, configured and ready to run. Passed as a std::unique_ptr by value, so the caller hands over ownership.
+     *
+     * @note The kernel is expected to have a 1D window. The function will multi-thread this window across the X dimension.
+     */
+    void configure(std::unique_ptr<INEGEMMWrapperKernel> kernel);
+
+    // Inherited methods overridden (run() is declared final, so derived classes cannot override it again):
+    void run() override final;
+
+protected:
+    std::unique_ptr<INEGEMMWrapperKernel> _kernel; /**< Kernel to run, held through a std::unique_ptr */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NESIMPLEASSEMBLYFUNCTION_H */
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index e763caa3a3..4f773861d2 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/helpers/SoftmaxHelpers.h"
 
 namespace arm_compute
 {
@@ -53,7 +54,7 @@ void NESoftmaxLayerGeneric<IS_LOG>::configure(ITensor *input, ITensor *output, f
         // Add to the memory manager _input_permuted
         _memory_group.manage(&_input_permuted);
 
-        _permute_input.configure(input, &_input_permuted, get_permutation_vector_from_softmax_axis(actual_axis));
+        _permute_input.configure(input, &_input_permuted, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
     }
 
     // We want to deal with a 2D input. Either it is the permuted version of the original input (4D case)
@@ -87,7 +88,7 @@ void NESoftmaxLayerGeneric<IS_LOG>::configure(ITensor *input, ITensor *output, f
         _input_permuted.allocator()->allocate();
 
         // Re-permute the permuted output into the requested (4D) output
-        _permute_output.configure(&_output_permuted, output, get_permutation_vector_from_softmax_axis(actual_axis));
+        _permute_output.configure(&_output_permuted, output, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
 
         // Allocate the intermediate permuted tensors
         _output_permuted.allocator()->allocate();
@@ -128,7 +129,7 @@ Status NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const I
 
     if(needs_permute)
     {
-        const PermutationVector permutation_vector = get_permutation_vector_from_softmax_axis(actual_axis);
+        const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
         const TensorShape       permuted_shape     = misc::shape_calculator::compute_permutation_output_shape(*input, permutation_vector);
         TensorInfo              input_permuted(input->clone()->set_tensor_shape(permuted_shape));
         ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(input, &input_permuted, permutation_vector));
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index 1bad310640..23b9f60c38 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -23,17 +23,17 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
 #include "support/MemorySupport.h"
 
-#include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp"
+#include "src/core/NEON/kernels/convolution/common/utils.hpp"
 #include "src/core/NEON/kernels/convolution/winograd/winograd.hpp"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
index 73a7caac8b..11e89cb23b 100644
--- a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
@@ -24,18 +24,21 @@
 
 #include "arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h"
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/InfoHelpers.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h"
+#include "src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp"
+#include "src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp"
+#include "src/core/helpers/AutoConfiguration.h"
 
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
+#include "support/MemorySupport.h"
+
 #include <set>
 
 namespace arm_compute
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
index 4c2f03a53a..bf34b0114b 100644
--- a/src/runtime/OMP/OMPScheduler.cpp
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -27,7 +27,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/runtime/CPUUtils.h"
+#include "src/runtime/CPUUtils.h"
 #include <omp.h>
 
 namespace arm_compute
diff --git a/src/runtime/SchedulerUtils.cpp b/src/runtime/SchedulerUtils.cpp
index 1c12e3ce58..6f9a32c879 100644
--- a/src/runtime/SchedulerUtils.cpp
+++ b/src/runtime/SchedulerUtils.cpp
@@ -21,6 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#include "src/runtime/SchedulerUtils.h"
 
 #include "arm_compute/core/Error.h"
 
@@ -28,6 +29,8 @@
 
 namespace arm_compute
 {
+namespace scheduler_utils
+{
 #ifndef BARE_METAL
 std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std::size_t n)
 {
@@ -76,4 +79,5 @@ std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std:
     }
 }
 #endif /* #ifndef BARE_METAL */
+} // namespace scheduler_utils
 } // namespace arm_compute
diff --git a/src/runtime/SchedulerUtils.h b/src/runtime/SchedulerUtils.h
new file mode 100644
index 0000000000..46644a369e
--- /dev/null
+++ b/src/runtime/SchedulerUtils.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_COMPUTE_SCHEDULER_UTILS_H
+#define SRC_COMPUTE_SCHEDULER_UTILS_H
+
+#include <cstddef>
+#include <utility>
+
+namespace arm_compute
+{
+namespace scheduler_utils
+{
+/** Given two dimensions and a maximum number of threads to utilise, calculate the best combination of threads
+ * that fit in (multiplied together) max_threads, assuming work in either dimension is equally difficult to compute.
+ *
+ * @param[in] max_threads Maximum number of threads to utilise.
+ * @param[in] m           First of the two dimensions.
+ * @param[in] n           Second of the two dimensions.
+ * @return [m_nthreads, n_nthreads] A pair of the threads that should be used in each dimension.
+ */
+std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std::size_t n);
+} // namespace scheduler_utils
+} // namespace arm_compute
+#endif /* SRC_COMPUTE_SCHEDULER_UTILS_H */
diff --git a/src/runtime/Utils.cpp b/src/runtime/Utils.cpp
index 534b421f8a..15e9d43a49 100644
--- a/src/runtime/Utils.cpp
+++ b/src/runtime/Utils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/Utils.h"
+#include "src/runtime/Utils.h"
 
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
@@ -31,6 +31,8 @@
 
 namespace arm_compute
 {
+namespace utils
+{
 #ifndef DOXYGEN_SKIP_THIS
 static const std::string information =
 #include "arm_compute_version.embed"
@@ -78,4 +80,5 @@ unsigned int calculate_number_of_stages_only_x_axis(size_t input_x_dimension, un
     const unsigned int num_of_stages = num_of_wg / 128 + 2;
     return num_of_stages;
 }
+} // namespace utils
 } // namespace arm_compute
diff --git a/src/runtime/Utils.h b/src/runtime/Utils.h
new file mode 100644
index 0000000000..f8775c9612
--- /dev/null
+++ b/src/runtime/Utils.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_UTILS_H
+#define SRC_RUNTIME_UTILS_H
+
+#include "arm_compute/runtime/IRuntimeContext.h"
+#include "arm_compute/runtime/Scheduler.h"
+
+#include <string>
+
+namespace arm_compute
+{
+namespace utils
+{
+/** Convert a Scheduler::Type into a string.
+ *
+ * @param[in] t @ref Scheduler::Type to be translated to string.
+ *
+ * @return The string describing the scheduler type.
+ */
+const std::string &string_from_scheduler_type(Scheduler::Type t);
+
+/** Schedule a kernel using the context if it is not nullptr, otherwise use the legacy scheduling flow.
+ *
+ * @param[in] ctx    Context to use. May be nullptr, in which case the legacy scheduling flow is used.
+ * @param[in] kernel Kernel to schedule.
+ * @param[in] hints  Hints to use.
+ */
+void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel, const IScheduler::Hints &hints);
+
+/** Calculate number of stages for parallel implementations.
+ * @param[in] input_x_dimension Input tensor x dimension.
+ * @param[in] axis              Axis to be used.
+ * @return The number of stages.
+ */
+unsigned int calculate_number_of_stages_only_x_axis(size_t input_x_dimension, unsigned int axis);
+} // namespace utils
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_UTILS_H */
-- 
cgit v1.2.1