From 658039bc4e06be34272eccf559a516a6b52f75f5 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas <georgios.pinitas@arm.com>
Date: Fri, 15 Sep 2017 16:30:50 +0100
Subject: COMPMID-534: Add MemoryManager support in NEON functions

Adds support for:
-NECannyEdge
-NEConvolution
-NEDirectConvolutionLayer
-NEGEMM
-NEGEMMLowp
-NEGaussian5x5
-NEHOGDescriptor
-NEHOGGradient
-NEHOGMultiDetection
-NEL2Normalize
-NELocallyConnectedLayer
-NENormalizationLayer
-NEOpticalFlow
-NEScale
-NESobel5x5
-NESobel7x7

Change-Id: I68e05aa6054372fa873a882633a15fb97882c00d
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/87926
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
---
 arm_compute/runtime/NEON/functions/NECannyEdge.h   |  5 ++-
 arm_compute/runtime/NEON/functions/NEConvolution.h |  6 +++-
 .../NEON/functions/NEDirectConvolutionLayer.h      |  7 +++-
 arm_compute/runtime/NEON/functions/NEGEMM.h        |  7 +++-
 arm_compute/runtime/NEON/functions/NEGEMMLowp.h    |  7 +++-
 arm_compute/runtime/NEON/functions/NEGaussian5x5.h |  6 +++-
 .../runtime/NEON/functions/NEHOGDescriptor.h       |  7 +++-
 arm_compute/runtime/NEON/functions/NEHOGGradient.h |  5 ++-
 .../runtime/NEON/functions/NEHOGMultiDetection.h   |  7 +++-
 arm_compute/runtime/NEON/functions/NEL2Normalize.h |  7 +++-
 .../NEON/functions/NELocallyConnectedLayer.h       |  7 +++-
 .../runtime/NEON/functions/NENormalizationLayer.h  |  8 +++--
 arm_compute/runtime/NEON/functions/NEOpticalFlow.h |  5 ++-
 arm_compute/runtime/NEON/functions/NEScale.h       | 23 +++++++++----
 arm_compute/runtime/NEON/functions/NESobel5x5.h    |  6 +++-
 arm_compute/runtime/NEON/functions/NESobel7x7.h    |  6 +++-
 scripts/clang_tidy_rules.py                        |  1 +
 src/runtime/NEON/functions/NECannyEdge.cpp         | 32 ++++++++++++++----
 src/runtime/NEON/functions/NEConvolution.cpp       | 12 +++++--
 .../NEON/functions/NEDirectConvolutionLayer.cpp    | 11 +++++--
 src/runtime/NEON/functions/NEGEMM.cpp              | 12 +++++--
 src/runtime/NEON/functions/NEGEMMLowp.cpp          | 12 +++++--
 src/runtime/NEON/functions/NEGaussian5x5.cpp       | 11 +++++--
 src/runtime/NEON/functions/NEHOGDescriptor.cpp     | 15 +++++++--
 src/runtime/NEON/functions/NEHOGGradient.cpp       | 13 ++++++--
 src/runtime/NEON/functions/NEHOGMultiDetection.cpp | 37 +++++++++++++++------
 src/runtime/NEON/functions/NEL2Normalize.cpp       | 11 +++++--
 .../NEON/functions/NELocallyConnectedLayer.cpp     | 13 ++++++--
 .../NEON/functions/NENormalizationLayer.cpp        | 11 +++++--
 src/runtime/NEON/functions/NEOpticalFlow.cpp       | 13 ++++++--
 src/runtime/NEON/functions/NEScale.cpp             | 38 +++++++++++++++-------
 src/runtime/NEON/functions/NESobel5x5.cpp          | 13 ++++++--
 src/runtime/NEON/functions/NESobel7x7.cpp          | 13 ++++++--
 33 files changed, 312 insertions(+), 75 deletions(-)
diff --git a/arm_compute/runtime/NEON/functions/NECannyEdge.h b/arm_compute/runtime/NEON/functions/NECannyEdge.h
index fbf2d90740..b7e0ffbcf1 100644
--- a/arm_compute/runtime/NEON/functions/NECannyEdge.h
+++ b/arm_compute/runtime/NEON/functions/NECannyEdge.h
@@ -28,6 +28,8 @@
 #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/Tensor.h"
 
 #include <cstdint>
@@ -55,7 +57,7 @@ public:
      *
      * Initialize Sobel kernel to nullptr.
      */
-    NECannyEdge();
+    NECannyEdge(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NECannyEdge(const NECannyEdge &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -80,6 +82,7 @@ public:
     void run() override;
 
 private:
+    MemoryGroup                   _memory_group;        /**< Function's memory group */
     std::unique_ptr<IFunction>    _sobel;               /**< Pointer to Sobel kernel */
     std::unique_ptr<INEKernel>    _gradient;            /**< Gradient kernel */
     NEEdgeNonMaxSuppressionKernel _non_max_suppr;       /**< Non-Maxima suppression kernel */
diff --git a/arm_compute/runtime/NEON/functions/NEConvolution.h b/arm_compute/runtime/NEON/functions/NEConvolution.h
index 1704d9fa94..9c0a906651 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolution.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolution.h
@@ -28,10 +28,13 @@
 #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/NEON/INESimpleFunction.h"
 #include "arm_compute/runtime/Tensor.h"
 
 #include <cstdint>
+#include <memory>
 
 namespace arm_compute
 {
@@ -70,7 +73,7 @@ class NEConvolutionSquare : public IFunction
 {
 public:
     /** Default constructor */
-    NEConvolutionSquare();
+    NEConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Initialize the function's source, destination, conv and border_mode.
      *
      * @param[in,out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
@@ -86,6 +89,7 @@ public:
     void run() override;
 
 private:
+    MemoryGroup                                   _memory_group;   /**< Function memory group */
     Tensor                                        _tmp;            /**< temporary buffer for output of horizontal pass */
     bool                                          _is_separable;   /**< true if the convolution can be separated */
     NESeparableConvolutionHorKernel<matrix_size>  _kernel_hor;     /**< kernel for horizontal pass of separated convolution */
diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
index 9d2775ada6..daaf18f297 100644
--- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
@@ -29,8 +29,12 @@
 #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/Tensor.h"
 
+#include <memory>
+
 namespace arm_compute
 {
 /** Function to run the direct convolution.
@@ -45,7 +49,7 @@ class NEDirectConvolutionLayer : public IFunction
 {
 public:
     /** Constructor */
-    NEDirectConvolutionLayer();
+    NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Set the input, weights, biases and output tensors.
       *
       * @note: DirectConvolution only works in the following configurations:
@@ -69,6 +73,7 @@ public:
     void run() override;
 
 private:
+    MemoryGroup                                  _memory_group;
     NEDirectConvolutionLayerBiasAccumulateKernel _accumulate_bias_kernel;
     NEDirectConvolutionLayerKernel               _conv_kernel;
     NEFillBorderKernel                           _input_border_handler;
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index 3c8d7cf9b7..b4b9e8be01 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -30,8 +30,12 @@
 #include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/Tensor.h"
 
+#include <memory>
+
 namespace arm_compute
 {
 /** Basic function to execute GEMM on NEON. This function calls the following NEON kernels:
@@ -46,7 +50,7 @@ class NEGEMM : public IFunction
 {
 public:
     /** Constructor */
-    NEGEMM();
+    NEGEMM(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Initialise the kernel's inputs, output
      *
      * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
@@ -65,6 +69,7 @@ public:
     void run() override;
 
 private:
+    MemoryGroup                _memory_group;
     NEGEMMInterleave4x4Kernel  _interleave_kernel;
     NEGEMMTranspose1xWKernel   _transpose_kernel;
     NEGEMMMatrixMultiplyKernel _mm_kernel;
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowp.h b/arm_compute/runtime/NEON/functions/NEGEMMLowp.h
index bfb1a494b8..0b0a7742f6 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowp.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowp.h
@@ -32,6 +32,10 @@
 #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+
+#include <memory>
 
 namespace arm_compute
 {
@@ -48,7 +52,7 @@ class NEGEMMLowp : public IFunction
 {
 public:
     /** Constructor */
-    NEGEMMLowp();
+    NEGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Initialise the kernel's inputs, output
     *
     * @note GEMM_LOWP:  low precision GEMM kernel
@@ -75,6 +79,7 @@ public:
     void run() override;
 
 private:
+    MemoryGroup                    _memory_group;
     NEGEMMInterleave4x4Kernel      _interleave_kernel;
     NEGEMMTranspose1xWKernel       _transpose_kernel;
     NEGEMMLowpMatrixMultiplyKernel _mm_kernel;
diff --git a/arm_compute/runtime/NEON/functions/NEGaussian5x5.h b/arm_compute/runtime/NEON/functions/NEGaussian5x5.h
index 699e42efb4..2aae3cb513 100644
--- a/arm_compute/runtime/NEON/functions/NEGaussian5x5.h
+++ b/arm_compute/runtime/NEON/functions/NEGaussian5x5.h
@@ -28,9 +28,12 @@
 #include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/Tensor.h"
 
 #include <cstdint>
+#include <memory>
 
 namespace arm_compute
 {
@@ -48,7 +51,7 @@ class NEGaussian5x5 : public IFunction
 public:
     /** Default constructor
      */
-    NEGaussian5x5();
+    NEGaussian5x5(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Initialise the function's input, output and border mode.
      *
      * @param[in, out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
@@ -62,6 +65,7 @@ public:
     void run() override;
 
 protected:
+    MemoryGroup             _memory_group;   /**< Function memory group */
     NEGaussian5x5HorKernel  _kernel_hor;     /**< kernel for horizontal pass */
     NEGaussian5x5VertKernel _kernel_vert;    /**< kernel for vertical pass */
     Tensor                  _tmp;            /**< temporary buffer for output of horizontal pass */
diff --git a/arm_compute/runtime/NEON/functions/NEHOGDescriptor.h b/arm_compute/runtime/NEON/functions/NEHOGDescriptor.h
index b7b4909060..30989568e1 100644
--- a/arm_compute/runtime/NEON/functions/NEHOGDescriptor.h
+++ b/arm_compute/runtime/NEON/functions/NEHOGDescriptor.h
@@ -26,9 +26,13 @@
 
 #include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/NEON/functions/NEHOGGradient.h"
 #include "arm_compute/runtime/Tensor.h"
 
+#include <memory>
+
 namespace arm_compute
 {
 class IHOG;
@@ -43,7 +47,7 @@ class NEHOGDescriptor : public IFunction
 {
 public:
     /** Default constructor */
-    NEHOGDescriptor();
+    NEHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Initialise the function's source, destination, HOG data-object and border mode
      *
      * @param[in, out] input                 Input tensor. Data type supported: U8
@@ -59,6 +63,7 @@ public:
     void run() override;
 
 private:
+    MemoryGroup                   _memory_group;
     NEHOGGradient                 _gradient;
     NEHOGOrientationBinningKernel _orient_bin;
     NEHOGBlockNormalizationKernel _block_norm;
diff --git a/arm_compute/runtime/NEON/functions/NEHOGGradient.h b/arm_compute/runtime/NEON/functions/NEHOGGradient.h
index dd2d99adfe..7e268411e1 100644
--- a/arm_compute/runtime/NEON/functions/NEHOGGradient.h
+++ b/arm_compute/runtime/NEON/functions/NEHOGGradient.h
@@ -27,6 +27,8 @@
 #include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/NEON/functions/NEDerivative.h"
 #include "arm_compute/runtime/Tensor.h"
 
@@ -46,7 +48,7 @@ class NEHOGGradient : public IFunction
 {
 public:
     /** Default constructor */
-    NEHOGGradient();
+    NEHOGGradient(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Initialise the function's source, destinations, phase type and border mode
      *
      * @param[in, out] input                 Input tensor. Data type supported: U8.
@@ -63,6 +65,7 @@ public:
     void run() override;
 
 private:
+    MemoryGroup                _memory_group;
     NEDerivative               _derivative;
     std::unique_ptr<INEKernel> _mag_phase;
     Tensor                     _gx;
diff --git a/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h b/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h
index 2d07e6435f..0d268ca565 100644
--- a/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h
+++ b/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h
@@ -29,10 +29,14 @@
 #include "arm_compute/core/IMultiHOG.h"
 #include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/NEON/functions/NEHOGDetector.h"
 #include "arm_compute/runtime/NEON/functions/NEHOGGradient.h"
 #include "arm_compute/runtime/Tensor.h"
 
+#include <memory>
+
 namespace arm_compute
 {
 /** Basic function to detect multiple objects (or the same object at different scales) on the same input image using HOG. This function calls the following NEON kernels:
@@ -53,7 +57,7 @@ class NEHOGMultiDetection : public IFunction
 {
 public:
     /** Default constructor */
-    NEHOGMultiDetection();
+    NEHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEHOGMultiDetection(const NEHOGMultiDetection &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -85,6 +89,7 @@ public:
     void run() override;
 
 private:
+    MemoryGroup                                                   _memory_group;
     NEHOGGradient                                                 _gradient_kernel;
     std::unique_ptr<NEHOGOrientationBinningKernel[]>              _orient_bin_kernel;
     std::unique_ptr<NEHOGBlockNormalizationKernel[]>              _block_norm_kernel;
diff --git a/arm_compute/runtime/NEON/functions/NEL2Normalize.h b/arm_compute/runtime/NEON/functions/NEL2Normalize.h
index 1297b99e79..95d5186c13 100644
--- a/arm_compute/runtime/NEON/functions/NEL2Normalize.h
+++ b/arm_compute/runtime/NEON/functions/NEL2Normalize.h
@@ -26,9 +26,13 @@
 
 #include "arm_compute/core/NEON/kernels/NEL2NormalizeKernel.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
 #include "arm_compute/runtime/Tensor.h"
 
+#include <memory>
+
 namespace arm_compute
 {
 class ITensor;
@@ -43,7 +47,7 @@ class NEL2Normalize : public IFunction
 {
 public:
     /** Constructor */
-    NEL2Normalize();
+    NEL2Normalize(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Set the input and output tensors.
      *
      * @param[in, out] input   Source tensor. Data types supported: F32. (Written to only for border_size != 0)
@@ -57,6 +61,7 @@ public:
     void run() override;
 
 private:
+    MemoryGroup          _memory_group;
     NEReductionOperation _reduce_func;
     NEL2NormalizeKernel  _normalize_kernel;
     Tensor               _sumsq;
diff --git a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
index efb2ff6c8b..18d2a1dfb5 100644
--- a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
@@ -31,8 +31,12 @@
 #include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
 #include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/Tensor.h"
 
+#include <memory>
+
 namespace arm_compute
 {
 class INETensor;
@@ -48,7 +52,7 @@ class NELocallyConnectedLayer : public IFunction
 {
 public:
     /** Default constructor */
-    NELocallyConnectedLayer();
+    NELocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Set the input and output tensors.
      *
      * @param[in]  input     Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -66,6 +70,7 @@ public:
     void run() override;
 
 private:
+    MemoryGroup                            _memory_group;
     NEIm2ColKernel                         _input_im2col_kernel;
     NEWeightsReshapeKernel                 _weights_reshape_kernel;
     NELocallyConnectedMatrixMultiplyKernel _mm_kernel;
diff --git a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
index 4cfea226f3..1c95c5bc4a 100644
--- a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
@@ -29,9 +29,12 @@
 #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/Tensor.h"
 
-#include "arm_compute/core/Types.h"
+#include <memory>
 
 namespace arm_compute
 {
@@ -48,7 +51,7 @@ class NENormalizationLayer : public IFunction
 {
 public:
     /** Default constructor */
-    NENormalizationLayer();
+    NENormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Set the input and output tensors.
      *
      * @param[in]  input     Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
@@ -62,6 +65,7 @@ public:
     void run() override;
 
 private:
+    MemoryGroup                     _memory_group;    /**< Function memory group */
     NENormalizationLayerKernel      _norm_kernel;     /**< Normalization layer kernel */
     NEPixelWiseMultiplicationKernel _multiply_kernel; /**< Pixel multiplication kernel */
     NEFillBorderKernel              _border_handler;  /**< Kernel to handle  borders */
diff --git a/arm_compute/runtime/NEON/functions/NEOpticalFlow.h b/arm_compute/runtime/NEON/functions/NEOpticalFlow.h
index 0534551d19..5d1fbe3a22 100644
--- a/arm_compute/runtime/NEON/functions/NEOpticalFlow.h
+++ b/arm_compute/runtime/NEON/functions/NEOpticalFlow.h
@@ -29,6 +29,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/Array.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/NEON/functions/NEScharr3x3.h"
 #include "arm_compute/runtime/Tensor.h"
 
@@ -51,7 +53,7 @@ class NEOpticalFlow : public IFunction
 {
 public:
     /** Constructor */
-    NEOpticalFlow();
+    NEOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEOpticalFlow(const NEOpticalFlow &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -80,6 +82,7 @@ public:
     void run() override;
 
 private:
+    MemoryGroup                          _memory_group;
     std::unique_ptr<NEScharr3x3[]>       _func_scharr;
     std::unique_ptr<NELKTrackerKernel[]> _kernel_tracker;
     std::unique_ptr<Tensor[]>            _scharr_gx;
diff --git a/arm_compute/runtime/NEON/functions/NEScale.h b/arm_compute/runtime/NEON/functions/NEScale.h
index e1da891dcf..91cda066e7 100644
--- a/arm_compute/runtime/NEON/functions/NEScale.h
+++ b/arm_compute/runtime/NEON/functions/NEScale.h
@@ -24,25 +24,30 @@
 #ifndef __ARM_COMPUTE_NESCALEIMAGE_H__
 #define __ARM_COMPUTE_NESCALEIMAGE_H__
 
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/Tensor.h"
 
 #include <cstdint>
+#include <memory>
 
 namespace arm_compute
 {
 class ITensor;
 
 /** Basic function to run @ref NEScaleKernel */
-class NEScale : public INESimpleFunction
+class NEScale : public IFunction
 {
 public:
     /** Constructor
      *
      * Initialize NEScale
      */
-    NEScale();
+    NEScale(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Initialize the function's source, destination, interpolation type and border_mode.
      *
      * @param[in, out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
@@ -53,10 +58,16 @@ public:
      */
     void configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
 
+    // Inherited methods overridden:
+    void run() override;
+
 private:
-    Tensor _offsets; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */
-    Tensor _dx;      /**< Element's distance between the X real coordinate and the smallest X following integer */
-    Tensor _dy;      /**< Element's distance between the Y real coordinate and the smallest Y following integer */
+    MemoryGroup        _memory_group;   /**< Function memory group */
+    Tensor             _offsets;        /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */
+    Tensor             _dx;             /**< Element's distance between the X real coordinate and the smallest X following integer */
+    Tensor             _dy;             /**< Element's distance between the Y real coordinate and the smallest Y following integer */
+    NEScaleKernel      _scale_kernel;   /**< Kernel to perform the scaling */
+    NEFillBorderKernel _border_handler; /**< kernel to handle tensor borders */
 };
 }
 #endif /*__ARM_COMPUTE_NESCALEIMAGE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NESobel5x5.h b/arm_compute/runtime/NEON/functions/NESobel5x5.h
index fc4d665a70..2b7cb70f15 100644
--- a/arm_compute/runtime/NEON/functions/NESobel5x5.h
+++ b/arm_compute/runtime/NEON/functions/NESobel5x5.h
@@ -28,9 +28,12 @@
 #include "arm_compute/core/NEON/kernels/NESobel5x5Kernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/Tensor.h"
 
 #include <cstdint>
+#include <memory>
 
 namespace arm_compute
 {
@@ -47,7 +50,7 @@ class NESobel5x5 : public IFunction
 {
 public:
     /** Default constructor */
-    NESobel5x5();
+    NESobel5x5(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Initialise the function's source, destinations and border mode.
      *
      * @note At least one of output_x or output_y must be not NULL.
@@ -65,6 +68,7 @@ public:
     void run() override;
 
 protected:
+    MemoryGroup          _memory_group;   /**< Function memory group */
     NESobel5x5HorKernel  _sobel_hor;      /**< Sobel Horizontal 5x5 kernel */
     NESobel5x5VertKernel _sobel_vert;     /**< Sobel Vertical 5x5 kernel */
     Tensor               _tmp_x;          /**< Temporary buffer for Sobel X */
diff --git a/arm_compute/runtime/NEON/functions/NESobel7x7.h b/arm_compute/runtime/NEON/functions/NESobel7x7.h
index 06b7c80ad6..5f7bab7cfd 100644
--- a/arm_compute/runtime/NEON/functions/NESobel7x7.h
+++ b/arm_compute/runtime/NEON/functions/NESobel7x7.h
@@ -28,9 +28,12 @@
 #include "arm_compute/core/NEON/kernels/NESobel7x7Kernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/Tensor.h"
 
 #include <cstdint>
+#include <memory>
 
 namespace arm_compute
 {
@@ -47,7 +50,7 @@ class NESobel7x7 : public IFunction
 {
 public:
     /** Default constructor */
-    NESobel7x7();
+    NESobel7x7(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Initialise the function's source, destinations and border mode.
      *
      * @note At least one of output_x or output_y must be not NULL.
@@ -65,6 +68,7 @@ public:
     void run() override;
 
 protected:
+    MemoryGroup          _memory_group;   /**< Function memory group */
     NESobel7x7HorKernel  _sobel_hor;      /**< Sobel Horizontal 7x7 kernel */
     NESobel7x7VertKernel _sobel_vert;     /**< Sobel Vertical 7x7 kernel */
     Tensor               _tmp_x;          /**< Temporary buffer for Sobel X */
diff --git a/scripts/clang_tidy_rules.py b/scripts/clang_tidy_rules.py
index 61ada49149..e4daceb071 100755
--- a/scripts/clang_tidy_rules.py
+++ b/scripts/clang_tidy_rules.py
@@ -77,6 +77,7 @@ def filter_clang_tidy_lines( lines ):
                ("NEPoolingLayerKernel.cpp" in line and "do not use C-style cast to convert between unrelated types" in line) or
                ("NESoftmaxLayerKernel.cpp" in line and "do not use C-style cast to convert between unrelated types" in line) or
                ("parameter 'memory_manager' is unused" in line) or
+               ("parameter 'memory_manager' is copied for each invocation but only used as a const reference" in line) or
                "3rdparty" in line):
                 print_context=False
                 continue
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index 318cea2342..9be1df6ea4 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -41,8 +41,9 @@
 
 using namespace arm_compute;
 
-NECannyEdge::NECannyEdge() // NOLINT
-    : _sobel(),
+NECannyEdge::NECannyEdge(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _sobel(),
       _gradient(),
       _non_max_suppr(),
       _edge_trace(),
@@ -93,6 +94,10 @@ void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr,
     _phase.allocator()->init(info);
     _nonmax.allocator()->init(info);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_gx);
+    _memory_group.manage(&_gy);
+
     // Configure/Init sobelNxN
     if(gradient_size == 3)
     {
@@ -117,6 +122,10 @@ void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr,
         ARM_COMPUTE_ERROR("Gradient size not supported\n");
     }
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_magnitude);
+    _memory_group.manage(&_phase);
+
     // Configure gradient
     if(use_fp16)
     {
@@ -131,6 +140,13 @@ void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr,
         _gradient = std::move(k);
     }
 
+    // Allocate intermediate tensors
+    _gx.allocator()->allocate();
+    _gy.allocator()->allocate();
+
+    // Manage intermediate buffers
+    _memory_group.manage(&_nonmax);
+
     // Configure non-maxima suppression
     _non_max_suppr.configure(&_magnitude, &_phase, &_nonmax, upper_thr, lower_thr, border_mode == BorderMode::UNDEFINED);
 
@@ -138,6 +154,10 @@ void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr,
     // it. If border mode is undefined filling the border is a nop.
     _border_mag_gradient.configure(&_magnitude, _non_max_suppr.border_size(), border_mode, constant_border_value);
 
+    // Allocate intermediate tensors
+    _phase.allocator()->allocate();
+    _magnitude.allocator()->allocate();
+
     // Configure edge tracing
     _edge_trace.configure(&_nonmax, output);
 
@@ -145,10 +165,6 @@ void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr,
     _border_edge_trace.configure(&_nonmax, _edge_trace.border_size(), BorderMode::CONSTANT, 0);
 
     // Allocate intermediate tensors
-    _gx.allocator()->allocate();
-    _gy.allocator()->allocate();
-    _phase.allocator()->allocate();
-    _magnitude.allocator()->allocate();
     _nonmax.allocator()->allocate();
 }
 
@@ -157,6 +173,8 @@ void NECannyEdge::run()
     ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
     ARM_COMPUTE_ERROR_ON(_output == nullptr);
 
+    _memory_group.acquire();
+
     // Run sobelNxN
     _sobel->run();
 
@@ -177,4 +195,6 @@ void NECannyEdge::run()
 
     // Run edge tracing
     NEScheduler::get().schedule(&_edge_trace, Window::DimY);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEConvolution.cpp b/src/runtime/NEON/functions/NEConvolution.cpp
index 249274ba32..f10ffa6d14 100644
--- a/src/runtime/NEON/functions/NEConvolution.cpp
+++ b/src/runtime/NEON/functions/NEConvolution.cpp
@@ -48,8 +48,8 @@ void NEConvolution3x3::configure(ITensor *input, ITensor *output, const int16_t
 }
 
 template <unsigned int matrix_size>
-NEConvolutionSquare<matrix_size>::NEConvolutionSquare()
-    : _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
+NEConvolutionSquare<matrix_size>::NEConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
 {
 }
 
@@ -72,6 +72,10 @@ void NEConvolutionSquare<matrix_size>::configure(ITensor *input, ITensor *output
 
         _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, intermediate_type));
 
+        // Manage intermediate buffers
+        _memory_group.manage(&_tmp);
+
+        // Calculate scale
         if(scale == 0)
         {
             scale = calculate_matrix_scale(conv, matrix_size);
@@ -98,8 +102,12 @@ void                   NEConvolutionSquare<matrix_size>::run()
 
     if(_is_separable)
     {
+        _memory_group.acquire();
+
         NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
         NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
+
+        _memory_group.release();
     }
     else
     {
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 810efe539f..a56a73c44a 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -33,8 +33,8 @@
 
 using namespace arm_compute;
 
-NEDirectConvolutionLayer::NEDirectConvolutionLayer()
-    : _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator()
+NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator()
 {
 }
 
@@ -46,6 +46,9 @@ void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights,
         _accumulator.allocator()->free();
     }
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_accumulator);
+
     // Allocate the intermediate accumulator tensor in case of fixed point input
     switch(output->info()->data_type())
     {
@@ -87,6 +90,10 @@ void NEDirectConvolutionLayer::run()
 {
     NEScheduler::get().schedule(&_input_border_handler, Window::DimZ);
 
+    _memory_group.acquire();
+
     NEScheduler::get().schedule(&_conv_kernel, Window::DimZ);
     NEScheduler::get().schedule(&_accumulate_bias_kernel, Window::DimY);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index dfcb3954ea..85b283cd41 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -36,8 +36,8 @@
 
 using namespace arm_compute;
 
-NEGEMM::NEGEMM()
-    : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _run_vector_matrix_multiplication(false), _run_addition(false)
+NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _run_vector_matrix_multiplication(false), _run_addition(false)
 {
 }
 
@@ -85,6 +85,10 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
         _tmp_a.allocator()->init(info_a);
         _tmp_b.allocator()->init(info_b);
 
+        // Manage intermediate buffers
+        _memory_group.manage(&_tmp_a);
+        _memory_group.manage(&_tmp_b);
+
         // Configure interleave kernel
         _interleave_kernel.configure(a, &_tmp_a);
 
@@ -109,6 +113,8 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
 
 void NEGEMM::run()
 {
+    _memory_group.acquire();
+
     if(!_run_vector_matrix_multiplication)
     {
         // Run interleave kernel
@@ -121,6 +127,8 @@ void NEGEMM::run()
     // Run matrix multiply kernel
     NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
 
+    _memory_group.release();
+
     // Run matrix addition kernel
     if(_run_addition)
     {
diff --git a/src/runtime/NEON/functions/NEGEMMLowp.cpp b/src/runtime/NEON/functions/NEGEMMLowp.cpp
index b64f769459..7413b28d03 100644
--- a/src/runtime/NEON/functions/NEGEMMLowp.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowp.cpp
@@ -34,8 +34,8 @@
 
 using namespace arm_compute;
 
-NEGEMMLowp::NEGEMMLowp()
-    : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
+NEGEMMLowp::NEGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
 {
 }
 
@@ -63,6 +63,10 @@ void NEGEMMLowp::configure(const ITensor *a, const ITensor *b, ITensor *output,
     _tmp_a.allocator()->init(info_a);
     _tmp_b.allocator()->init(info_b);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_tmp_a);
+    _memory_group.manage(&_tmp_b);
+
     _interleave_kernel.configure(a, &_tmp_a);
     _transpose_kernel.configure(b, &_tmp_b);
     _mm_kernel.configure(&_tmp_a, &_tmp_b, output, a_offset, b_offset, output_offset, output_mult_int, shift);
@@ -73,6 +77,8 @@ void NEGEMMLowp::configure(const ITensor *a, const ITensor *b, ITensor *output,
 
 void NEGEMMLowp::run()
 {
+    _memory_group.acquire();
+
     /* Run interleave kernel */
     NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
 
@@ -81,4 +87,6 @@ void NEGEMMLowp::run()
 
     /* Run matrix multiply kernel */
     NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEGaussian5x5.cpp b/src/runtime/NEON/functions/NEGaussian5x5.cpp
index a1ce985633..f085975b1e 100644
--- a/src/runtime/NEON/functions/NEGaussian5x5.cpp
+++ b/src/runtime/NEON/functions/NEGaussian5x5.cpp
@@ -32,8 +32,8 @@
 
 using namespace arm_compute;
 
-NEGaussian5x5::NEGaussian5x5()
-    : _kernel_hor(), _kernel_vert(), _tmp(), _border_handler()
+NEGaussian5x5::NEGaussian5x5(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _kernel_hor(), _kernel_vert(), _tmp(), _border_handler()
 {
 }
 
@@ -43,6 +43,9 @@ void NEGaussian5x5::configure(ITensor *input, ITensor *output, BorderMode border
     TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::S16);
     _tmp.allocator()->init(tensor_info);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_tmp);
+
     // Create and configure kernels for the two passes
     _kernel_hor.configure(input, &_tmp, border_mode == BorderMode::UNDEFINED);
     _kernel_vert.configure(&_tmp, output, border_mode == BorderMode::UNDEFINED);
@@ -54,7 +57,11 @@ void NEGaussian5x5::configure(ITensor *input, ITensor *output, BorderMode border
 
 void NEGaussian5x5::run()
 {
+    _memory_group.acquire();
+
     NEScheduler::get().schedule(&_border_handler, Window::DimZ);
     NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
     NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEHOGDescriptor.cpp b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
index a592f53d44..5e98269f47 100644
--- a/src/runtime/NEON/functions/NEHOGDescriptor.cpp
+++ b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
@@ -31,8 +31,8 @@
 
 using namespace arm_compute;
 
-NEHOGDescriptor::NEHOGDescriptor()
-    : _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
+NEHOGDescriptor::NEHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
 {
 }
 
@@ -71,9 +71,16 @@ void NEHOGDescriptor::configure(ITensor *input, ITensor *output, const IHOG *hog
     TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
     _hog_space.allocator()->init(info_space);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_mag);
+    _memory_group.manage(&_phase);
+
     // Initialise gradient kernel
     _gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_hog_space);
+
     // Initialise orientation binning kernel
     _orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info());
 
@@ -88,6 +95,8 @@ void NEHOGDescriptor::configure(ITensor *input, ITensor *output, const IHOG *hog
 
 void NEHOGDescriptor::run()
 {
+    _memory_group.acquire();
+
     // Run gradient
     _gradient.run();
 
@@ -96,4 +105,6 @@ void NEHOGDescriptor::run()
 
     // Run block normalization kernel
     NEScheduler::get().schedule(&_block_norm, Window::DimY);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEHOGGradient.cpp b/src/runtime/NEON/functions/NEHOGGradient.cpp
index 3e2640d631..efc8690ede 100644
--- a/src/runtime/NEON/functions/NEHOGGradient.cpp
+++ b/src/runtime/NEON/functions/NEHOGGradient.cpp
@@ -30,8 +30,9 @@
 
 using namespace arm_compute;
 
-NEHOGGradient::NEHOGGradient() // NOLINT
-    : _derivative(),
+NEHOGGradient::NEHOGGradient(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _derivative(),
       _mag_phase(nullptr),
       _gx(),
       _gy()
@@ -51,6 +52,10 @@ void NEHOGGradient::configure(ITensor *input, ITensor *output_magnitude, ITensor
     _gx.allocator()->init(info);
     _gy.allocator()->init(info);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_gx);
+    _memory_group.manage(&_gy);
+
     // Initialise derivate kernel
     _derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value);
 
@@ -75,9 +80,13 @@ void NEHOGGradient::configure(ITensor *input, ITensor *output_magnitude, ITensor
 
 void NEHOGGradient::run()
 {
+    _memory_group.acquire();
+
     // Run derivative
     _derivative.run();
 
     // Run magnitude/phase kernel
     NEScheduler::get().schedule(_mag_phase.get(), Window::DimY);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
index 1a038a2f62..8c834e2a93 100644
--- a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
+++ b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
@@ -32,8 +32,9 @@
 
 using namespace arm_compute;
 
-NEHOGMultiDetection::NEHOGMultiDetection() // NOLINT
-    : _gradient_kernel(),
+NEHOGMultiDetection::NEHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _gradient_kernel(),
       _orient_bin_kernel(),
       _block_norm_kernel(),
       _hog_detect_kernel(),
@@ -139,6 +140,10 @@ void NEHOGMultiDetection::configure(ITensor *input, const IMultiHOG *multi_hog,
     TensorInfo info_phase(shape_img, Format::U8);
     _phase.allocator()->init(info_phase);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_mag);
+    _memory_group.manage(&_phase);
+
     // Initialise gradient kernel
     _gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value);
 
@@ -164,10 +169,17 @@ void NEHOGMultiDetection::configure(ITensor *input, const IMultiHOG *multi_hog,
         TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
         _hog_space[i].allocator()->init(info_space);
 
+        // Manage intermediate buffers
+        _memory_group.manage(_hog_space.get() + i);
+
         // Initialise orientation binning kernel
         _orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
     }
 
+    // Allocate intermediate tensors
+    _mag.allocator()->allocate();
+    _phase.allocator()->allocate();
+
     // Configure NETensor for the normalized HOG space and block normalization kernel
     for(size_t i = 0; i < _num_block_norm_kernel; ++i)
     {
@@ -178,10 +190,19 @@ void NEHOGMultiDetection::configure(ITensor *input, const IMultiHOG *multi_hog,
         TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
         _hog_norm_space[i].allocator()->init(tensor_info);
 
+        // Manage intermediate buffers
+        _memory_group.manage(_hog_norm_space.get() + i);
+
         // Initialize block normalization kernel
         _block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
     }
 
+    // Allocate intermediate tensors
+    for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+    {
+        _hog_space[i].allocator()->allocate();
+    }
+
     // Configure HOG detector kernel
     for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
     {
@@ -194,14 +215,6 @@ void NEHOGMultiDetection::configure(ITensor *input, const IMultiHOG *multi_hog,
     _non_maxima_kernel->configure(_detection_windows, min_distance);
 
     // Allocate intermediate tensors
-    _mag.allocator()->allocate();
-    _phase.allocator()->allocate();
-
-    for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
-    {
-        _hog_space[i].allocator()->allocate();
-    }
-
     for(size_t i = 0; i < _num_block_norm_kernel; ++i)
     {
         _hog_norm_space[i].allocator()->allocate();
@@ -212,6 +225,8 @@ void NEHOGMultiDetection::run()
 {
     ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
 
+    _memory_group.acquire();
+
     // Reset detection window
     _detection_windows->clear();
 
@@ -241,4 +256,6 @@ void NEHOGMultiDetection::run()
     {
         NEScheduler::get().schedule(_non_maxima_kernel.get(), Window::DimY);
     }
+
+    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEL2Normalize.cpp b/src/runtime/NEON/functions/NEL2Normalize.cpp
index 378d78e3f3..349a781b0b 100644
--- a/src/runtime/NEON/functions/NEL2Normalize.cpp
+++ b/src/runtime/NEON/functions/NEL2Normalize.cpp
@@ -28,13 +28,16 @@
 
 using namespace arm_compute;
 
-NEL2Normalize::NEL2Normalize()
-    : _reduce_func(), _normalize_kernel(), _sumsq()
+NEL2Normalize::NEL2Normalize(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
 {
 }
 
 void NEL2Normalize::configure(ITensor *input, ITensor *output, unsigned int axis, float epsilon)
 {
+    // Manage intermediate buffers
+    _memory_group.manage(&_sumsq);
+
     // Configure Kernels
     _reduce_func.configure(input, &_sumsq, axis, ReductionOperation::SUM_SQUARE);
     _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon);
@@ -45,6 +48,10 @@ void NEL2Normalize::configure(ITensor *input, ITensor *output, unsigned int axis
 
 void NEL2Normalize::run()
 {
+    _memory_group.acquire();
+
     _reduce_func.run();
     NEScheduler::get().schedule(&_normalize_kernel, Window::DimY);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index e7c71e04d1..cb48598921 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -33,8 +33,9 @@
 
 using namespace arm_compute;
 
-NELocallyConnectedLayer::NELocallyConnectedLayer()
-    : _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), _is_first_run(false)
+NELocallyConnectedLayer::NELocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
+      _is_first_run(false)
 {
 }
 
@@ -102,6 +103,10 @@ void NELocallyConnectedLayer::configure(const ITensor *input, const ITensor *wei
     shape_gemm.set(1, mat_input_rows);
     _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_input_im2col_reshaped);
+    _memory_group.manage(&_gemm_output);
+
     // Configure kernels
     _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
     _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
@@ -123,6 +128,8 @@ void NELocallyConnectedLayer::run()
         NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
     }
 
+    _memory_group.acquire();
+
     // Run input reshaping
     NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
 
@@ -131,4 +138,6 @@ void NELocallyConnectedLayer::run()
 
     // Reshape output matrix
     NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
index 69ff32591f..e01ef6660d 100644
--- a/src/runtime/NEON/functions/NENormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -32,8 +32,8 @@
 
 using namespace arm_compute;
 
-NENormalizationLayer::NENormalizationLayer()
-    : _norm_kernel(), _multiply_kernel(), _border_handler(), _input_squared()
+NENormalizationLayer::NENormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _norm_kernel(), _multiply_kernel(), _border_handler(), _input_squared()
 {
 }
 
@@ -44,6 +44,9 @@ void NENormalizationLayer::configure(const ITensor *input, ITensor *output, Norm
     TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
     _input_squared.allocator()->init(tensor_info);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_input_squared);
+
     // Configure kernels
     _norm_kernel.configure(input, &_input_squared, output, norm_info);
     _multiply_kernel.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
@@ -55,7 +58,11 @@ void NENormalizationLayer::configure(const ITensor *input, ITensor *output, Norm
 
 void NENormalizationLayer::run()
 {
+    _memory_group.acquire();
+
     NEScheduler::get().schedule(&_multiply_kernel, Window::DimY);
     NEScheduler::get().schedule(&_border_handler, Window::DimY);
     NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEOpticalFlow.cpp b/src/runtime/NEON/functions/NEOpticalFlow.cpp
index 3e69a33897..e90d8f6270 100644
--- a/src/runtime/NEON/functions/NEOpticalFlow.cpp
+++ b/src/runtime/NEON/functions/NEOpticalFlow.cpp
@@ -37,8 +37,9 @@
 
 using namespace arm_compute;
 
-NEOpticalFlow::NEOpticalFlow() // NOLINT
-    : _func_scharr(),
+NEOpticalFlow::NEOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _func_scharr(),
       _kernel_tracker(),
       _scharr_gx(),
       _scharr_gy(),
@@ -97,6 +98,10 @@ void NEOpticalFlow::configure(const Pyramid *old_pyramid, const Pyramid *new_pyr
         _scharr_gx[i].allocator()->init(tensor_info);
         _scharr_gy[i].allocator()->init(tensor_info);
 
+        // Manage intermediate buffers
+        _memory_group.manage(_scharr_gx.get() + i);
+        _memory_group.manage(_scharr_gy.get() + i);
+
         // Init Scharr kernel
         _func_scharr[i].configure(old_ith_input, _scharr_gx.get() + i, _scharr_gy.get() + i, border_mode, constant_border_value);
 
@@ -116,6 +121,8 @@ void NEOpticalFlow::run()
 {
     ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
 
+    _memory_group.acquire();
+
     for(unsigned int level = _num_levels; level > 0; --level)
     {
         // Run Scharr kernel
@@ -124,4 +131,6 @@ void NEOpticalFlow::run()
         // Run Lucas-Kanade kernel
         NEScheduler::get().schedule(_kernel_tracker.get() + level - 1, Window::DimX);
     }
+
+    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index 7fc352ab1f..6c5ac3c45b 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -27,10 +27,10 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
 #include "support/ToolchainSupport.h"
 
@@ -86,10 +86,13 @@ void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float
 }
 } // namespace
 
-NEScale::NEScale() // NOLINT
-    : _offsets(),
+NEScale::NEScale(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _offsets(),
       _dx(),
-      _dy()
+      _dy(),
+      _scale_kernel(),
+      _border_handler()
 {
 }
 
@@ -119,8 +122,6 @@ void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy pol
         policy = InterpolationPolicy::NEAREST_NEIGHBOR;
     }
 
-    auto k = arm_compute::support::cpp14::make_unique<NEScaleKernel>();
-
     // Check if the border mode is UNDEFINED
     const bool border_undefined = border_mode == BorderMode::UNDEFINED;
 
@@ -130,8 +131,9 @@ void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy pol
         {
             TensorInfo tensor_info_offsets(shape, Format::S32);
             _offsets.allocator()->init(tensor_info_offsets);
+            _memory_group.manage(&_offsets);
 
-            k->configure(input, nullptr, nullptr, &_offsets, output, policy, border_undefined);
+            _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_undefined);
 
             // Allocate once the configure methods have been called
             _offsets.allocator()->allocate();
@@ -149,7 +151,12 @@ void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy pol
             _dx.allocator()->init(tensor_info_dxdy);
             _dy.allocator()->init(tensor_info_dxdy);
 
-            k->configure(input, &_dx, &_dy, &_offsets, output, policy, border_undefined);
+            // Manage intermediate buffers
+            _memory_group.manage(&_offsets);
+            _memory_group.manage(&_dx);
+            _memory_group.manage(&_dy);
+
+            _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_undefined);
 
             // Allocate once the configure methods have been called
             _offsets.allocator()->allocate();
@@ -162,13 +169,22 @@ void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy pol
         }
         case InterpolationPolicy::AREA:
         {
-            k->configure(input, nullptr, nullptr, nullptr, output, policy, border_undefined);
+            _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, policy, border_undefined);
             break;
         }
         default:
             ARM_COMPUTE_ERROR("Unsupported interpolation mode");
     }
 
-    _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler.configure(input, _scale_kernel.border_size(), border_mode, PixelValue(constant_border_value));
+}
+
+void NEScale::run()
+{
+    _memory_group.acquire();
+
+    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+    NEScheduler::get().schedule(&_scale_kernel, Window::DimY);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NESobel5x5.cpp b/src/runtime/NEON/functions/NESobel5x5.cpp
index 305d21122e..d8f4eda2ff 100644
--- a/src/runtime/NEON/functions/NESobel5x5.cpp
+++ b/src/runtime/NEON/functions/NESobel5x5.cpp
@@ -32,8 +32,8 @@
 
 using namespace arm_compute;
 
-NESobel5x5::NESobel5x5()
-    : _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler()
+NESobel5x5::NESobel5x5(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler()
 {
 }
 
@@ -50,6 +50,8 @@ void NESobel5x5::configure(ITensor *input, ITensor *output_x, ITensor *output_y,
     {
         _tmp_x.allocator()->init(tensor_info);
         _tmp_y.allocator()->init(tensor_info);
+        _memory_group.manage(&_tmp_x);
+        _memory_group.manage(&_tmp_y);
         _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
         _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
         _tmp_x.allocator()->allocate();
@@ -58,6 +60,7 @@ void NESobel5x5::configure(ITensor *input, ITensor *output_x, ITensor *output_y,
     else if(run_sobel_x)
     {
         _tmp_x.allocator()->init(tensor_info);
+        _memory_group.manage(&_tmp_x);
         _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
         _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
         _tmp_x.allocator()->allocate();
@@ -65,6 +68,7 @@ void NESobel5x5::configure(ITensor *input, ITensor *output_x, ITensor *output_y,
     else if(run_sobel_y)
     {
         _tmp_y.allocator()->init(tensor_info);
+        _memory_group.manage(&_tmp_y);
         _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
         _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
         _tmp_y.allocator()->allocate();
@@ -76,6 +80,11 @@ void NESobel5x5::configure(ITensor *input, ITensor *output_x, ITensor *output_y,
 void NESobel5x5::run()
 {
     NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+
+    _memory_group.acquire();
+
     NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
     NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NESobel7x7.cpp b/src/runtime/NEON/functions/NESobel7x7.cpp
index 57fe028567..5b6f60b338 100644
--- a/src/runtime/NEON/functions/NESobel7x7.cpp
+++ b/src/runtime/NEON/functions/NESobel7x7.cpp
@@ -32,8 +32,8 @@
 
 using namespace arm_compute;
 
-NESobel7x7::NESobel7x7()
-    : _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler()
+NESobel7x7::NESobel7x7(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler()
 {
 }
 
@@ -50,6 +50,8 @@ void NESobel7x7::configure(ITensor *input, ITensor *output_x, ITensor *output_y,
     {
         _tmp_x.allocator()->init(tensor_info);
         _tmp_y.allocator()->init(tensor_info);
+        _memory_group.manage(&_tmp_x);
+        _memory_group.manage(&_tmp_y);
         _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
         _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
         _tmp_x.allocator()->allocate();
@@ -58,6 +60,7 @@ void NESobel7x7::configure(ITensor *input, ITensor *output_x, ITensor *output_y,
     else if(run_sobel_x)
     {
         _tmp_x.allocator()->init(tensor_info);
+        _memory_group.manage(&_tmp_x);
         _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
         _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
         _tmp_x.allocator()->allocate();
@@ -65,6 +68,7 @@ void NESobel7x7::configure(ITensor *input, ITensor *output_x, ITensor *output_y,
     else if(run_sobel_y)
     {
         _tmp_y.allocator()->init(tensor_info);
+        _memory_group.manage(&_tmp_y);
         _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
         _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
         _tmp_y.allocator()->allocate();
@@ -76,6 +80,11 @@ void NESobel7x7::configure(ITensor *input, ITensor *output_x, ITensor *output_y,
 void NESobel7x7::run()
 {
     NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+
+    _memory_group.acquire();
+
     NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
     NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
+
+    _memory_group.release();
 }
-- 
cgit v1.2.1