From 8a94e7cec7b09a417a278425e2b56e7af5bf45d9 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas <georgios.pinitas@arm.com>
Date: Fri, 15 Sep 2017 19:06:47 +0100
Subject: COMPMID-534: Add MemoryManager support in OpenCL functions

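Adds memory manager support (an optional IMemoryManager constructor
argument and an internal CLMemoryGroup member) to: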
-CLConvolution
-CLGEMM
-CLGEMMLowp
-CLHOGDescriptor
-CLHOGGradient
-CLHOGMultiDetection
-CLL2Normalize
-CLLocallyConnectedLayer
-CLOpticalFlow
-CLReductionOperation

Change-Id: Ib13354d274ccf32ae933f3fbbad3ac3896cfd3bd
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/87938
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
---
 arm_compute/runtime/CL/functions/CLConvolution.h           |  6 +++++-
 arm_compute/runtime/CL/functions/CLGEMM.h                  |  7 ++++++-
 arm_compute/runtime/CL/functions/CLGEMMLowp.h              | 12 ++++++++----
 arm_compute/runtime/CL/functions/CLHOGDescriptor.h         |  7 ++++++-
 arm_compute/runtime/CL/functions/CLHOGGradient.h           |  6 +++++-
 arm_compute/runtime/CL/functions/CLHOGMultiDetection.h     |  7 ++++++-
 arm_compute/runtime/CL/functions/CLL2Normalize.h           |  6 +++++-
 arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h |  7 ++++++-
 arm_compute/runtime/CL/functions/CLOpticalFlow.h           |  5 ++++-
 arm_compute/runtime/CL/functions/CLReductionOperation.h    |  5 ++++-
 10 files changed, 55 insertions(+), 13 deletions(-)

(limited to 'arm_compute/runtime/CL')
diff --git a/arm_compute/runtime/CL/functions/CLConvolution.h b/arm_compute/runtime/CL/functions/CLConvolution.h
index f526f6ff4a..bc05cb2a85 100644
--- a/arm_compute/runtime/CL/functions/CLConvolution.h
+++ b/arm_compute/runtime/CL/functions/CLConvolution.h
@@ -27,11 +27,14 @@
 #include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
 #include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
 
 #include <cstdint>
+#include <memory>
 
 namespace arm_compute
 {
@@ -70,7 +73,7 @@ class CLConvolutionSquare : public IFunction
 {
 public:
     /** Default constructor */
-    CLConvolutionSquare();
+    CLConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Initialize the function's source, destination, conv and border_mode.
      *
      * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
@@ -86,6 +89,7 @@ public:
     void run() override;
 
 private:
+    CLMemoryGroup                                 _memory_group;   /**< Function's memory group */
     CLTensor                                      _tmp;            /**< temporary buffer for output of horizontal pass */
     bool                                          _is_separable;   /**< true if the convolution can be separated */
     CLSeparableConvolutionHorKernel<matrix_size>  _kernel_hor;     /**< kernel for horizontal pass of separated convolution */
diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h
index 9b887305cb..2765b77b7d 100644
--- a/arm_compute/runtime/CL/functions/CLGEMM.h
+++ b/arm_compute/runtime/CL/functions/CLGEMM.h
@@ -29,8 +29,12 @@
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+#include <memory>
 
 namespace arm_compute
 {
@@ -48,7 +52,7 @@ class CLGEMM : public IFunction
 {
 public:
     /** Default constructor. */
-    CLGEMM();
+    CLGEMM(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Initialise the kernel's inputs and output
      *
      * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
@@ -70,6 +74,7 @@ public:
     void run() override;
 
 private:
+    CLMemoryGroup              _memory_group;
     CLGEMMInterleave4x4Kernel  _interleave_kernel;
     CLGEMMTranspose1xWKernel   _transpose_kernel;
     CLGEMMMatrixMultiplyKernel _mm_kernel;
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowp.h b/arm_compute/runtime/CL/functions/CLGEMMLowp.h
index da8883c3f8..613fcaa7e0 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowp.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowp.h
@@ -25,12 +25,15 @@
 #define __ARM_COMPUTE_CLGEMMLOWP_H__
 
 #include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/IFunction.h"
-
 #include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+#include <memory>
 
 namespace arm_compute
 {
@@ -47,7 +50,7 @@ class CLGEMMLowp : public IFunction
 {
 public:
     /** Constructor */
-    CLGEMMLowp();
+    CLGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Initialise the kernel's inputs, output
     *
     * @note GEMM_LOWP:  low precision matrix multiply kernel
@@ -75,6 +78,7 @@ public:
     void run() override;
 
 private:
+    CLMemoryGroup                  _memory_group;
     CLGEMMInterleave4x4Kernel      _interleave_kernel;
     CLGEMMTranspose1xWKernel       _transpose_kernel;
     CLGEMMLowpMatrixMultiplyKernel _mm_kernel;
diff --git a/arm_compute/runtime/CL/functions/CLHOGDescriptor.h b/arm_compute/runtime/CL/functions/CLHOGDescriptor.h
index cdb23bff33..00d64f109f 100644
--- a/arm_compute/runtime/CL/functions/CLHOGDescriptor.h
+++ b/arm_compute/runtime/CL/functions/CLHOGDescriptor.h
@@ -26,9 +26,13 @@
 
 #include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+#include <memory>
 
 namespace arm_compute
 {
@@ -44,7 +48,7 @@ class CLHOGDescriptor : public IFunction
 {
 public:
     /** Default constructor */
-    CLHOGDescriptor();
+    CLHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Initialise the function's source, destination, HOG data-object and border mode
      *
      * @param[in, out] input                 Input tensor. Data type supported: U8
@@ -60,6 +64,7 @@ public:
     void run() override;
 
 private:
+    CLMemoryGroup                 _memory_group;
     CLHOGGradient                 _gradient;
     CLHOGOrientationBinningKernel _orient_bin;
     CLHOGBlockNormalizationKernel _block_norm;
diff --git a/arm_compute/runtime/CL/functions/CLHOGGradient.h b/arm_compute/runtime/CL/functions/CLHOGGradient.h
index e74a68497f..051e5860d7 100644
--- a/arm_compute/runtime/CL/functions/CLHOGGradient.h
+++ b/arm_compute/runtime/CL/functions/CLHOGGradient.h
@@ -28,11 +28,14 @@
 
 #include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLDerivative.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
 
 #include <cstdint>
+#include <memory>
 
 namespace arm_compute
 {
@@ -46,7 +49,7 @@ class CLHOGGradient : public IFunction
 {
 public:
     /** Default constructor */
-    CLHOGGradient();
+    CLHOGGradient(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Initialise the function's source, destinations, phase type and border mode
      *
      * @param[in, out] input                 Input tensor. Data type supported: U8.
@@ -63,6 +66,7 @@ public:
     void run() override;
 
 private:
+    CLMemoryGroup          _memory_group;
     CLDerivative           _derivative;
     CLMagnitudePhaseKernel _mag_phase;
     CLTensor               _gx;
diff --git a/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h b/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
index 3fe0fa932a..1ff986511e 100644
--- a/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
+++ b/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
@@ -28,10 +28,14 @@
 #include "arm_compute/core/CL/ICLMultiHOG.h"
 #include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
 #include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLHOGDetector.h"
 #include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+#include <memory>
 
 namespace arm_compute
 {
@@ -53,7 +57,7 @@ class CLHOGMultiDetection : public IFunction
 {
 public:
     /** Default constructor */
-    CLHOGMultiDetection();
+    CLHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     CLHOGMultiDetection(const CLHOGMultiDetection &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -85,6 +89,7 @@ public:
     void run() override;
 
 private:
+    CLMemoryGroup                                                 _memory_group;
     CLHOGGradient                                                 _gradient_kernel;
     std::unique_ptr<CLHOGOrientationBinningKernel[]>              _orient_bin_kernel;
     std::unique_ptr<CLHOGBlockNormalizationKernel[]>              _block_norm_kernel;
diff --git a/arm_compute/runtime/CL/functions/CLL2Normalize.h b/arm_compute/runtime/CL/functions/CLL2Normalize.h
index 52c562c61b..20af54eda2 100644
--- a/arm_compute/runtime/CL/functions/CLL2Normalize.h
+++ b/arm_compute/runtime/CL/functions/CLL2Normalize.h
@@ -26,11 +26,14 @@
 
 #include "arm_compute/core/CL/kernels/CLL2NormalizeKernel.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 #include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
+#include "arm_compute/runtime/IMemoryManager.h"
 
 #include <cstdint>
+#include <memory>
 
 namespace arm_compute
 {
@@ -42,7 +45,7 @@ class CLL2Normalize : public IFunction
 {
 public:
     /** Constructor */
-    CLL2Normalize();
+    CLL2Normalize(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
 
     /** Set the input and output tensors.
      *
@@ -57,6 +60,7 @@ public:
     void run() override;
 
 private:
+    CLMemoryGroup        _memory_group;
     CLReductionOperation _reduce_func;
     CLL2NormalizeKernel  _normalize_kernel;
     CLTensor             _sumsq;
diff --git a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
index 5f4f1ba1d7..f56039f62a 100644
--- a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
@@ -31,7 +31,11 @@
 #include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
 #include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+#include <memory>
 
 namespace arm_compute
 {
@@ -48,7 +52,7 @@ class CLLocallyConnectedLayer : public IFunction
 {
 public:
     /** Default constructor */
-    CLLocallyConnectedLayer();
+    CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Set the input and output tensors.
      *
      * @param[in]  input     Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -66,6 +70,7 @@ public:
     void run() override;
 
 private:
+    CLMemoryGroup                          _memory_group;
     CLIm2ColKernel                         _input_im2col_kernel;
     CLWeightsReshapeKernel                 _weights_reshape_kernel;
     CLLocallyConnectedMatrixMultiplyKernel _mm_kernel;
diff --git a/arm_compute/runtime/CL/functions/CLOpticalFlow.h b/arm_compute/runtime/CL/functions/CLOpticalFlow.h
index ca3f86100e..94dda186bf 100644
--- a/arm_compute/runtime/CL/functions/CLOpticalFlow.h
+++ b/arm_compute/runtime/CL/functions/CLOpticalFlow.h
@@ -29,9 +29,11 @@
 #include "arm_compute/core/IArray.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLArray.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
 
 #include <cstddef>
 #include <cstdint>
@@ -57,7 +59,7 @@ class CLOpticalFlow : public IFunction
 {
 public:
     /** Default constructor */
-    CLOpticalFlow();
+    CLOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     CLOpticalFlow(const CLOpticalFlow &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -91,6 +93,7 @@ public:
     void run() override;
 
 private:
+    CLMemoryGroup                              _memory_group;
     std::unique_ptr<CLLKTrackerInitKernel[]>   _tracker_init_kernel;
     std::unique_ptr<CLLKTrackerStage0Kernel[]> _tracker_stage0_kernel;
     std::unique_ptr<CLLKTrackerStage1Kernel[]> _tracker_stage1_kernel;
diff --git a/arm_compute/runtime/CL/functions/CLReductionOperation.h b/arm_compute/runtime/CL/functions/CLReductionOperation.h
index 89fdad2b24..09beabad8d 100644
--- a/arm_compute/runtime/CL/functions/CLReductionOperation.h
+++ b/arm_compute/runtime/CL/functions/CLReductionOperation.h
@@ -27,8 +27,10 @@
 #include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
 #include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
 
 #include <cstdint>
 #include <memory>
@@ -44,7 +46,7 @@ class CLReductionOperation : public IFunction
 {
 public:
     /* Constructor */
-    CLReductionOperation();
+    CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
 
     /** Set the input and output tensors.
      *
@@ -59,6 +61,7 @@ public:
     void run() override;
 
 private:
+    CLMemoryGroup                                 _memory_group;
     std::vector<CLTensor *>                       _sums_vector{ nullptr };
     std::unique_ptr<CLReductionOperationKernel[]> _reduction_kernels_vector{ nullptr };
     std::unique_ptr<CLFillBorderKernel[]>         _border_handlers_vector{ nullptr };
-- 
cgit v1.2.1