From 8a94e7cec7b09a417a278425e2b56e7af5bf45d9 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Fri, 15 Sep 2017 19:06:47 +0100
Subject: COMPMID-534: Add MemoryManager support in OpenCL functions

Adds support for:
-CLConvolution
-CLGEMM
-CLGEMMLowp
-CLHOGDescriptor
-CLHOGGradient
-CLHOGMultiDetection
-CLL2Normalize
-CLLocallyConnectedLayer
-CLOpticalFlow
-CLReductionOperation

Change-Id: Ib13354d274ccf32ae933f3fbbad3ac3896cfd3bd
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/87938
Tested-by: Kaizen
Reviewed-by: Pablo Tello
---
 src/runtime/CL/functions/CLConvolution.cpp        | 11 +++++--
 src/runtime/CL/functions/CLGEMM.cpp               | 12 +++++--
 src/runtime/CL/functions/CLGEMMLowp.cpp           | 12 +++++--
 src/runtime/CL/functions/CLHOGDescriptor.cpp      | 15 +++++++--
 src/runtime/CL/functions/CLHOGGradient.cpp        | 12 +++++--
 src/runtime/CL/functions/CLHOGMultiDetection.cpp  | 37 ++++++++++++++++------
 src/runtime/CL/functions/CLL2Normalize.cpp        | 11 +++++--
 .../CL/functions/CLLocallyConnectedLayer.cpp      | 13 ++++++--
 src/runtime/CL/functions/CLOpticalFlow.cpp        | 13 ++++++--
 src/runtime/CL/functions/CLReductionOperation.cpp |  9 ++++--
 10 files changed, 117 insertions(+), 28 deletions(-)

diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp
index 641044451d..a9b086773c 100644
--- a/src/runtime/CL/functions/CLConvolution.cpp
+++ b/src/runtime/CL/functions/CLConvolution.cpp
@@ -47,8 +47,8 @@ void CLConvolution3x3::configure(ICLTensor *input, ICLTensor *output, const int1
 }
 
 template <unsigned int matrix_size>
-CLConvolutionSquare<matrix_size>::CLConvolutionSquare()
-    : _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
+CLConvolutionSquare<matrix_size>::CLConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
 {
 }
@@ -66,6 +66,9 @@ void CLConvolutionSquare<matrix_size>::configure(ICLTensor *input, ICLTensor *ou
         std::pair<DataType, DataType> type_pair = data_type_for_convolution(conv_col, conv_row, matrix_size);
         _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, type_pair.first));
 
+        // Manage intermediate buffers
+        _memory_group.manage(&_tmp);
+
         if(scale == 0)
         {
             scale = calculate_matrix_scale(conv, matrix_size);
         }
@@ -92,8 +95,12 @@ void CLConvolutionSquare<matrix_size>::run()
     if(_is_separable)
     {
+        _memory_group.acquire();
+
         CLScheduler::get().enqueue(_kernel_hor, false);
         CLScheduler::get().enqueue(_kernel_vert);
+
+        _memory_group.release();
     }
     else
     {
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 9867229a7c..a81d1138c0 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -38,8 +38,8 @@
 
 using namespace arm_compute;
 
-CLGEMM::CLGEMM()
-    : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false)
+CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false)
 {
 }
@@ -86,6 +86,10 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
     TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), b->info()->fixed_point_position());
     _tmp_b.allocator()->init(info_b);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_tmp_a);
+    _memory_group.manage(&_tmp_b);
+
     // Configure interleave kernel
     _interleave_kernel.configure(a, &_tmp_a);
@@ -115,6 +119,8 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
 
 void CLGEMM::run()
 {
+    _memory_group.acquire();
+
     if(_is_interleaved_transposed)
     {
         // Run interleave kernel
@@ -132,4 +138,6 @@ void CLGEMM::run()
     {
         CLScheduler::get().enqueue(_ma_kernel);
     }
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLGEMMLowp.cpp b/src/runtime/CL/functions/CLGEMMLowp.cpp
index 45e011d8ce..db6d11c2c3 100644
--- a/src/runtime/CL/functions/CLGEMMLowp.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowp.cpp
@@ -33,8 +33,8 @@
 
 using namespace arm_compute;
 
-CLGEMMLowp::CLGEMMLowp()
-    : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
+CLGEMMLowp::CLGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
 {
 }
@@ -62,6 +62,10 @@ void CLGEMMLowp::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *ou
     TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
     _tmp_b.allocator()->init(info_b);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_tmp_a);
+    _memory_group.manage(&_tmp_b);
+
     // Configure kernels
     _interleave_kernel.configure(a, &_tmp_a);
     _transpose_kernel.configure(b, &_tmp_b);
@@ -74,6 +78,8 @@ void CLGEMMLowp::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *ou
 
 void CLGEMMLowp::run()
 {
+    _memory_group.acquire();
+
     /* Run interleave kernel */
     CLScheduler::get().enqueue(_interleave_kernel, false);
 
@@ -82,4 +88,6 @@ void CLGEMMLowp::run()
 
     /* Run matrix multiply kernel */
     CLScheduler::get().enqueue(_mm_kernel, false);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp
index b1b5a03ac1..1470d5cdc1 100644
--- a/src/runtime/CL/functions/CLHOGDescriptor.cpp
+++ b/src/runtime/CL/functions/CLHOGDescriptor.cpp
@@ -31,8 +31,8 @@
 
 using namespace arm_compute;
 
-CLHOGDescriptor::CLHOGDescriptor()
-    : _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
+CLHOGDescriptor::CLHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
 {
 }
@@ -71,9 +71,16 @@ void CLHOGDescriptor::configure(ICLTensor *input, ICLTensor *output, const IHOG
     TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
     _hog_space.allocator()->init(info_space);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_mag);
+    _memory_group.manage(&_phase);
+
     // Initialise gradient kernel
     _gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_hog_space);
+
     // Initialise orientation binning kernel
     _orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info());
 
@@ -88,6 +95,8 @@ void CLHOGDescriptor::configure(ICLTensor *input, ICLTensor *output, const IHOG
 
 void CLHOGDescriptor::run()
 {
+    _memory_group.acquire();
+
     // Run gradient
     _gradient.run();
 
@@ -96,4 +105,6 @@ void CLHOGDescriptor::run()
 
     // Run block normalization
     CLScheduler::get().enqueue(_block_norm);
+
+    _memory_group.release();
 }
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp
index 2387474358..51aeaed5cf 100644
--- a/src/runtime/CL/functions/CLHOGGradient.cpp
+++ b/src/runtime/CL/functions/CLHOGGradient.cpp
@@ -29,8 +29,8 @@
 
 using namespace arm_compute;
 
-CLHOGGradient::CLHOGGradient()
-    : _derivative(), _mag_phase(), _gx(), _gy()
+CLHOGGradient::CLHOGGradient(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _derivative(), _mag_phase(), _gx(), _gy()
 {
 }
@@ -47,6 +47,10 @@ void CLHOGGradient::configure(ICLTensor *input, ICLTensor *output_magnitude, ICL
     _gx.allocator()->init(info);
     _gy.allocator()->init(info);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_gx);
+    _memory_group.manage(&_gy);
+
     // Initialise derivate kernel
     _derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value);
 
@@ -67,9 +71,13 @@ void CLHOGGradient::configure(ICLTensor *input, ICLTensor *output_magnitude, ICL
 
 void CLHOGGradient::run()
 {
+    _memory_group.acquire();
+
     // Run derivative
     _derivative.run();
 
     // Run magnitude/phase kernel
     CLScheduler::get().enqueue(_mag_phase);
+
+    _memory_group.release();
 }
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
index 9eed355710..8012c2f60a 100644
--- a/src/runtime/CL/functions/CLHOGMultiDetection.cpp
+++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
@@ -34,8 +34,9 @@
 
 using namespace arm_compute;
 
-CLHOGMultiDetection::CLHOGMultiDetection() // NOLINT
-    : _gradient_kernel(),
+CLHOGMultiDetection::CLHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _gradient_kernel(),
       _orient_bin_kernel(),
       _block_norm_kernel(),
       _hog_detect_kernel(),
@@ -141,6 +142,10 @@ void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_h
     TensorInfo info_phase(shape_img, Format::U8);
     _phase.allocator()->init(info_phase);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_mag);
+    _memory_group.manage(&_phase);
+
     // Initialise gradient kernel
     _gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value);
 
@@ -166,10 +171,17 @@ void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_h
         TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
         _hog_space[i].allocator()->init(info_space);
 
+        // Manage intermediate buffers
+        _memory_group.manage(_hog_space.get() + i);
+
         // Initialise orientation binning kernel
         _orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
     }
 
+    // Allocate intermediate tensors
+    _mag.allocator()->allocate();
+    _phase.allocator()->allocate();
+
     // Configure CLTensor for the normalized HOG space and block normalization kernel
     for(size_t i = 0; i < _num_block_norm_kernel; ++i)
     {
@@ -180,10 +192,19 @@ void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_h
         TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
         _hog_norm_space[i].allocator()->init(tensor_info);
 
+        // Manage intermediate buffers
+        _memory_group.manage(_hog_norm_space.get() + i);
+
         // Initialize block normalization kernel
         _block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
     }
 
+    // Allocate intermediate tensors
+    for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+    {
+        _hog_space[i].allocator()->allocate();
+    }
+
     detection_window_strides->map(CLScheduler::get().queue(), true);
 
     // Configure HOG detector kernel
@@ -200,14 +221,6 @@ void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_h
     _non_maxima_kernel->configure(_detection_windows, min_distance);
 
     // Allocate intermediate tensors
-    _mag.allocator()->allocate();
-    _phase.allocator()->allocate();
-
-    for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
-    {
-        _hog_space[i].allocator()->allocate();
-    }
-
     for(size_t i = 0; i < _num_block_norm_kernel; ++i)
     {
         _hog_norm_space[i].allocator()->allocate();
     }
@@ -218,6 +231,8 @@ void CLHOGMultiDetection::run()
 {
     ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
 
+    _memory_group.acquire();
+
     // Reset detection window
     _detection_windows->clear();
 
@@ -250,4 +265,6 @@ void CLHOGMultiDetection::run()
         Scheduler::get().schedule(_non_maxima_kernel.get(), Window::DimY);
         _detection_windows->unmap(CLScheduler::get().queue());
     }
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLL2Normalize.cpp b/src/runtime/CL/functions/CLL2Normalize.cpp
index 18d05beba2..99be8cae4c 100644
--- a/src/runtime/CL/functions/CLL2Normalize.cpp
+++ b/src/runtime/CL/functions/CLL2Normalize.cpp
@@ -34,13 +34,16 @@
 
 using namespace arm_compute;
 
-CLL2Normalize::CLL2Normalize()
-    : _reduce_func(), _normalize_kernel(), _sumsq()
+CLL2Normalize::CLL2Normalize(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
 {
 }
 
 void CLL2Normalize::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, float epsilon)
 {
+    // Manage intermediate buffers
+    _memory_group.manage(&_sumsq);
+
     // Configure kernels
     _reduce_func.configure(input, &_sumsq, axis, ReductionOperation::SUM_SQUARE);
     _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon);
@@ -51,6 +54,10 @@ void CLL2Normalize::configure(ICLTensor *input, ICLTensor *output, unsigned int
 
 void CLL2Normalize::run()
 {
+    _memory_group.acquire();
+
     _reduce_func.run();
     CLScheduler::get().enqueue(_normalize_kernel, true);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index ef6fb50bbf..a89a45a044 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -33,8 +33,9 @@
 
 using namespace arm_compute;
 
-CLLocallyConnectedLayer::CLLocallyConnectedLayer()
-    : _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), _is_first_run(false)
+CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
+      _is_first_run(false)
 {
 }
@@ -99,6 +100,10 @@ void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor
     shape_gemm.set(1, mat_input_rows);
     _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_input_im2col_reshaped);
+    _memory_group.manage(&_gemm_output);
+
     // Configure kernels
     _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(conv_w, conv_h), conv_info, _has_bias);
     _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
@@ -120,6 +125,8 @@ void CLLocallyConnectedLayer::run()
         CLScheduler::get().enqueue(_weights_reshape_kernel);
     }
 
+    _memory_group.acquire();
+
     // Run input reshaping
     CLScheduler::get().enqueue(_input_im2col_kernel);
 
@@ -128,4 +135,6 @@ void CLLocallyConnectedLayer::run()
 
     // Reshape output matrix
     CLScheduler::get().enqueue(_output_col2im_kernel, false);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp b/src/runtime/CL/functions/CLOpticalFlow.cpp
index 07ca2f91b4..d00b1b5099 100644
--- a/src/runtime/CL/functions/CLOpticalFlow.cpp
+++ b/src/runtime/CL/functions/CLOpticalFlow.cpp
@@ -37,8 +37,9 @@
 
 using namespace arm_compute;
 
-CLOpticalFlow::CLOpticalFlow() // NOLINT
-    : _tracker_init_kernel(),
+CLOpticalFlow::CLOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _tracker_init_kernel(),
       _tracker_stage0_kernel(),
       _tracker_stage1_kernel(),
       _tracker_finalize_kernel(),
@@ -116,6 +117,10 @@ void CLOpticalFlow::configure(const CLPyramid *old_pyramid, const CLPyramid *new
         _scharr_gx[i].allocator()->init(tensor_info);
         _scharr_gy[i].allocator()->init(tensor_info);
 
+        // Manage intermediate buffers
+        _memory_group.manage(_scharr_gx.get() + i);
+        _memory_group.manage(_scharr_gy.get() + i);
+
         // Init Scharr kernel
         _func_scharr[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value);
 
@@ -144,6 +149,8 @@ void CLOpticalFlow::run()
 {
     ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
 
+    _memory_group.acquire();
+
     for(unsigned int level = _num_levels; level > 0; --level)
     {
         // Run Scharr kernel
@@ -160,4 +167,6 @@ void CLOpticalFlow::run()
     }
 
     CLScheduler::get().enqueue(_tracker_finalize_kernel, true);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index 5bb33205ca..6643c9bd46 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -35,8 +35,8 @@
 
 using namespace arm_compute;
 
-CLReductionOperation::CLReductionOperation()
-    : _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages()
+CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages()
 {
 }
@@ -59,6 +59,7 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign
         shape.set(0, ceil(shape.x() / 128.f));
         auto *tensor = new CLTensor;
         tensor->allocator()->init(TensorInfo(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+        _memory_group.manage(tensor);
         _sums_vector.push_back(tensor);
     }
 
@@ -76,9 +77,13 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign
 
 void CLReductionOperation::run()
 {
+    _memory_group.acquire();
+
     for(unsigned int i = 0; i < _num_of_stages; ++i)
    {
         CLScheduler::get().enqueue(_border_handlers_vector[i], false);
         CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
     }
+
+    _memory_group.release();
 }
\ No newline at end of file
--
cgit v1.2.1
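
Usage note: the following is a minimal sketch of how a caller could share one memory manager across the functions touched by this patch, so that the intermediate tensors they register with _memory_group.manage() in configure() (and bracket with acquire()/release() in run()) are served from a common pool. It assumes the MemoryManagerOnDemand, BlobLifetimeManager, PoolManager and CLBufferAllocator types of this library generation; the finalization calls shown (set_allocator(), set_num_pools(), finalize()) are an assumption about the runtime API of that period and have been reworked in later releases, so treat this as a sketch rather than the exact interface.

// Sketch only: the memory-manager setup below is version-dependent (see note above).
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/BlobLifetimeManager.h"
#include "arm_compute/runtime/CL/CLBufferAllocator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"
#include "arm_compute/runtime/MemoryManagerOnDemand.h"
#include "arm_compute/runtime/PoolManager.h"

#include <memory>

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // One memory manager, shared by every function that is constructed with it.
    auto lifetime_mgr = std::make_shared<BlobLifetimeManager>();
    auto pool_mgr     = std::make_shared<PoolManager>();
    auto memory_mgr   = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);

    CLTensor a, b, dst;
    a.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));

    // configure() registers the function's intermediate tensors (_tmp_a/_tmp_b above) with its memory group.
    CLGEMM gemm(memory_mgr);
    gemm.configure(&a, &b, nullptr, &dst, 1.f, 0.f);

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();

    // Finalize the manager once all functions sharing it have been configured.
    // NOTE: assumed calls; later releases replace them with a single populate() step.
    CLBufferAllocator cl_allocator;
    memory_mgr->set_allocator(&cl_allocator);
    memory_mgr->set_num_pools(1);
    memory_mgr->finalize();

    // run() acquires the group's backing memory, enqueues the kernels, then releases it.
    gemm.run();
    CLScheduler::get().sync();

    return 0;
}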