aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Georgios Pinitas <georgios.pinitas@arm.com> 2017-09-15 19:06:47 +0100
committer Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:35:24 +0000
commit 8a94e7cec7b09a417a278425e2b56e7af5bf45d9 (patch)
tree e952f39903d4624bbd6445c9cc6c7dbcc1114026
parent 658039bc4e06be34272eccf559a516a6b52f75f5 (diff)
download ComputeLibrary-8a94e7cec7b09a417a278425e2b56e7af5bf45d9.tar.gz
COMPMID-534: Add MemoryManager support in OpenCL functions
Adds support for:
 -CLConvolution
 -CLGEMM
 -CLGEMMLowp
 -CLHOGDescriptor
 -CLHOGGradient
 -CLHOGMultiDetection
 -CLL2Normalize
 -CLLocallyConnectedLayer
 -CLOpticalFlow
 -CLReductionOperation

Change-Id: Ib13354d274ccf32ae933f3fbbad3ac3896cfd3bd
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/87938
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
-rw-r--r-- arm_compute/runtime/CL/functions/CLConvolution.h 6
-rw-r--r-- arm_compute/runtime/CL/functions/CLGEMM.h 7
-rw-r--r-- arm_compute/runtime/CL/functions/CLGEMMLowp.h 12
-rw-r--r-- arm_compute/runtime/CL/functions/CLHOGDescriptor.h 7
-rw-r--r-- arm_compute/runtime/CL/functions/CLHOGGradient.h 6
-rw-r--r-- arm_compute/runtime/CL/functions/CLHOGMultiDetection.h 7
-rw-r--r-- arm_compute/runtime/CL/functions/CLL2Normalize.h 6
-rw-r--r-- arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h 7
-rw-r--r-- arm_compute/runtime/CL/functions/CLOpticalFlow.h 5
-rw-r--r-- arm_compute/runtime/CL/functions/CLReductionOperation.h 5
-rw-r--r-- src/runtime/CL/functions/CLConvolution.cpp 11
-rw-r--r-- src/runtime/CL/functions/CLGEMM.cpp 12
-rw-r--r-- src/runtime/CL/functions/CLGEMMLowp.cpp 12
-rw-r--r-- src/runtime/CL/functions/CLHOGDescriptor.cpp 15
-rw-r--r-- src/runtime/CL/functions/CLHOGGradient.cpp 12
-rw-r--r-- src/runtime/CL/functions/CLHOGMultiDetection.cpp 37
-rw-r--r-- src/runtime/CL/functions/CLL2Normalize.cpp 11
-rw-r--r-- src/runtime/CL/functions/CLLocallyConnectedLayer.cpp 13
-rw-r--r-- src/runtime/CL/functions/CLOpticalFlow.cpp 13
-rw-r--r-- src/runtime/CL/functions/CLReductionOperation.cpp 9
20 files changed, 172 insertions, 41 deletions
diff --git a/arm_compute/runtime/CL/functions/CLConvolution.h b/arm_compute/runtime/CL/functions/CLConvolution.h
index f526f6ff4a..bc05cb2a85 100644
--- a/arm_compute/runtime/CL/functions/CLConvolution.h
+++ b/arm_compute/runtime/CL/functions/CLConvolution.h
@@ -27,11 +27,14 @@
#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
#include <cstdint>
+#include <memory>
namespace arm_compute
{
@@ -70,7 +73,7 @@ class CLConvolutionSquare : public IFunction
{
public:
/** Default constructor */
- CLConvolutionSquare();
+ CLConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Initialize the function's source, destination, conv and border_mode.
*
* @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
@@ -86,6 +89,7 @@ public:
void run() override;
private:
+ CLMemoryGroup _memory_group; /**< Function's memory group */
CLTensor _tmp; /**< temporary buffer for output of horizontal pass */
bool _is_separable; /**< true if the convolution can be separated */
CLSeparableConvolutionHorKernel<matrix_size> _kernel_hor; /**< kernel for horizontal pass of separated convolution */
diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h
index 9b887305cb..2765b77b7d 100644
--- a/arm_compute/runtime/CL/functions/CLGEMM.h
+++ b/arm_compute/runtime/CL/functions/CLGEMM.h
@@ -29,8 +29,12 @@
#include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+#include <memory>
namespace arm_compute
{
@@ -48,7 +52,7 @@ class CLGEMM : public IFunction
{
public:
/** Default constructor. */
- CLGEMM();
+ CLGEMM(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Initialise the kernel's inputs and output
*
* @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
@@ -70,6 +74,7 @@ public:
void run() override;
private:
+ CLMemoryGroup _memory_group;
CLGEMMInterleave4x4Kernel _interleave_kernel;
CLGEMMTranspose1xWKernel _transpose_kernel;
CLGEMMMatrixMultiplyKernel _mm_kernel;
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowp.h b/arm_compute/runtime/CL/functions/CLGEMMLowp.h
index da8883c3f8..613fcaa7e0 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowp.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowp.h
@@ -25,12 +25,15 @@
#define __ARM_COMPUTE_CLGEMMLOWP_H__
#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+#include <memory>
namespace arm_compute
{
@@ -47,7 +50,7 @@ class CLGEMMLowp : public IFunction
{
public:
/** Constructor */
- CLGEMMLowp();
+ CLGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Initialise the kernel's inputs, output
*
* @note GEMM_LOWP: low precision matrix multiply kernel
@@ -75,6 +78,7 @@ public:
void run() override;
private:
+ CLMemoryGroup _memory_group;
CLGEMMInterleave4x4Kernel _interleave_kernel;
CLGEMMTranspose1xWKernel _transpose_kernel;
CLGEMMLowpMatrixMultiplyKernel _mm_kernel;
diff --git a/arm_compute/runtime/CL/functions/CLHOGDescriptor.h b/arm_compute/runtime/CL/functions/CLHOGDescriptor.h
index cdb23bff33..00d64f109f 100644
--- a/arm_compute/runtime/CL/functions/CLHOGDescriptor.h
+++ b/arm_compute/runtime/CL/functions/CLHOGDescriptor.h
@@ -26,9 +26,13 @@
#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+#include <memory>
namespace arm_compute
{
@@ -44,7 +48,7 @@ class CLHOGDescriptor : public IFunction
{
public:
/** Default constructor */
- CLHOGDescriptor();
+ CLHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Initialise the function's source, destination, HOG data-object and border mode
*
* @param[in, out] input Input tensor. Data type supported: U8
@@ -60,6 +64,7 @@ public:
void run() override;
private:
+ CLMemoryGroup _memory_group;
CLHOGGradient _gradient;
CLHOGOrientationBinningKernel _orient_bin;
CLHOGBlockNormalizationKernel _block_norm;
diff --git a/arm_compute/runtime/CL/functions/CLHOGGradient.h b/arm_compute/runtime/CL/functions/CLHOGGradient.h
index e74a68497f..051e5860d7 100644
--- a/arm_compute/runtime/CL/functions/CLHOGGradient.h
+++ b/arm_compute/runtime/CL/functions/CLHOGGradient.h
@@ -28,11 +28,14 @@
#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDerivative.h"
#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
#include <cstdint>
+#include <memory>
namespace arm_compute
{
@@ -46,7 +49,7 @@ class CLHOGGradient : public IFunction
{
public:
/** Default constructor */
- CLHOGGradient();
+ CLHOGGradient(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Initialise the function's source, destinations, phase type and border mode
*
* @param[in, out] input Input tensor. Data type supported: U8.
@@ -63,6 +66,7 @@ public:
void run() override;
private:
+ CLMemoryGroup _memory_group;
CLDerivative _derivative;
CLMagnitudePhaseKernel _mag_phase;
CLTensor _gx;
diff --git a/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h b/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
index 3fe0fa932a..1ff986511e 100644
--- a/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
+++ b/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
@@ -28,10 +28,14 @@
#include "arm_compute/core/CL/ICLMultiHOG.h"
#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
#include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLHOGDetector.h"
#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+#include <memory>
namespace arm_compute
{
@@ -53,7 +57,7 @@ class CLHOGMultiDetection : public IFunction
{
public:
/** Default constructor */
- CLHOGMultiDetection();
+ CLHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Prevent instances of this class from being copied (As this class contains pointers) */
CLHOGMultiDetection(const CLHOGMultiDetection &) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -85,6 +89,7 @@ public:
void run() override;
private:
+ CLMemoryGroup _memory_group;
CLHOGGradient _gradient_kernel;
std::unique_ptr<CLHOGOrientationBinningKernel[]> _orient_bin_kernel;
std::unique_ptr<CLHOGBlockNormalizationKernel[]> _block_norm_kernel;
diff --git a/arm_compute/runtime/CL/functions/CLL2Normalize.h b/arm_compute/runtime/CL/functions/CLL2Normalize.h
index 52c562c61b..20af54eda2 100644
--- a/arm_compute/runtime/CL/functions/CLL2Normalize.h
+++ b/arm_compute/runtime/CL/functions/CLL2Normalize.h
@@ -26,11 +26,14 @@
#include "arm_compute/core/CL/kernels/CLL2NormalizeKernel.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
+#include "arm_compute/runtime/IMemoryManager.h"
#include <cstdint>
+#include <memory>
namespace arm_compute
{
@@ -42,7 +45,7 @@ class CLL2Normalize : public IFunction
{
public:
/** Constructor */
- CLL2Normalize();
+ CLL2Normalize(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Set the input and output tensors.
*
@@ -57,6 +60,7 @@ public:
void run() override;
private:
+ CLMemoryGroup _memory_group;
CLReductionOperation _reduce_func;
CLL2NormalizeKernel _normalize_kernel;
CLTensor _sumsq;
diff --git a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
index 5f4f1ba1d7..f56039f62a 100644
--- a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
@@ -31,7 +31,11 @@
#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+#include <memory>
namespace arm_compute
{
@@ -48,7 +52,7 @@ class CLLocallyConnectedLayer : public IFunction
{
public:
/** Default constructor */
- CLLocallyConnectedLayer();
+ CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Set the input and output tensors.
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -66,6 +70,7 @@ public:
void run() override;
private:
+ CLMemoryGroup _memory_group;
CLIm2ColKernel _input_im2col_kernel;
CLWeightsReshapeKernel _weights_reshape_kernel;
CLLocallyConnectedMatrixMultiplyKernel _mm_kernel;
diff --git a/arm_compute/runtime/CL/functions/CLOpticalFlow.h b/arm_compute/runtime/CL/functions/CLOpticalFlow.h
index ca3f86100e..94dda186bf 100644
--- a/arm_compute/runtime/CL/functions/CLOpticalFlow.h
+++ b/arm_compute/runtime/CL/functions/CLOpticalFlow.h
@@ -29,9 +29,11 @@
#include "arm_compute/core/IArray.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLArray.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
#include <cstddef>
#include <cstdint>
@@ -57,7 +59,7 @@ class CLOpticalFlow : public IFunction
{
public:
/** Default constructor */
- CLOpticalFlow();
+ CLOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Prevent instances of this class from being copied (As this class contains pointers) */
CLOpticalFlow(const CLOpticalFlow &) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -91,6 +93,7 @@ public:
void run() override;
private:
+ CLMemoryGroup _memory_group;
std::unique_ptr<CLLKTrackerInitKernel[]> _tracker_init_kernel;
std::unique_ptr<CLLKTrackerStage0Kernel[]> _tracker_stage0_kernel;
std::unique_ptr<CLLKTrackerStage1Kernel[]> _tracker_stage1_kernel;
diff --git a/arm_compute/runtime/CL/functions/CLReductionOperation.h b/arm_compute/runtime/CL/functions/CLReductionOperation.h
index 89fdad2b24..09beabad8d 100644
--- a/arm_compute/runtime/CL/functions/CLReductionOperation.h
+++ b/arm_compute/runtime/CL/functions/CLReductionOperation.h
@@ -27,8 +27,10 @@
#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
#include <cstdint>
#include <memory>
@@ -44,7 +46,7 @@ class CLReductionOperation : public IFunction
{
public:
/* Constructor */
- CLReductionOperation();
+ CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Set the input and output tensors.
*
@@ -59,6 +61,7 @@ public:
void run() override;
private:
+ CLMemoryGroup _memory_group;
std::vector<CLTensor *> _sums_vector{ nullptr };
std::unique_ptr<CLReductionOperationKernel[]> _reduction_kernels_vector{ nullptr };
std::unique_ptr<CLFillBorderKernel[]> _border_handlers_vector{ nullptr };
diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp
index 641044451d..a9b086773c 100644
--- a/src/runtime/CL/functions/CLConvolution.cpp
+++ b/src/runtime/CL/functions/CLConvolution.cpp
@@ -47,8 +47,8 @@ void CLConvolution3x3::configure(ICLTensor *input, ICLTensor *output, const int1
}
template <unsigned int matrix_size>
-CLConvolutionSquare<matrix_size>::CLConvolutionSquare()
- : _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
+CLConvolutionSquare<matrix_size>::CLConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
{
}
@@ -66,6 +66,9 @@ void CLConvolutionSquare<matrix_size>::configure(ICLTensor *input, ICLTensor *ou
std::pair<DataType, DataType> type_pair = data_type_for_convolution(conv_col, conv_row, matrix_size);
_tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, type_pair.first));
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp);
+
if(scale == 0)
{
scale = calculate_matrix_scale(conv, matrix_size);
@@ -92,8 +95,12 @@ void CLConvolutionSquare<matrix_size>::run()
if(_is_separable)
{
+ _memory_group.acquire();
+
CLScheduler::get().enqueue(_kernel_hor, false);
CLScheduler::get().enqueue(_kernel_vert);
+
+ _memory_group.release();
}
else
{
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 9867229a7c..a81d1138c0 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -38,8 +38,8 @@
using namespace arm_compute;
-CLGEMM::CLGEMM()
- : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false)
+CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false)
{
}
@@ -86,6 +86,10 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), b->info()->fixed_point_position());
_tmp_b.allocator()->init(info_b);
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp_a);
+ _memory_group.manage(&_tmp_b);
+
// Configure interleave kernel
_interleave_kernel.configure(a, &_tmp_a);
@@ -115,6 +119,8 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
void CLGEMM::run()
{
+ _memory_group.acquire();
+
if(_is_interleaved_transposed)
{
// Run interleave kernel
@@ -132,4 +138,6 @@ void CLGEMM::run()
{
CLScheduler::get().enqueue(_ma_kernel);
}
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLGEMMLowp.cpp b/src/runtime/CL/functions/CLGEMMLowp.cpp
index 45e011d8ce..db6d11c2c3 100644
--- a/src/runtime/CL/functions/CLGEMMLowp.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowp.cpp
@@ -33,8 +33,8 @@
using namespace arm_compute;
-CLGEMMLowp::CLGEMMLowp()
- : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
+CLGEMMLowp::CLGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
{
}
@@ -62,6 +62,10 @@ void CLGEMMLowp::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *ou
TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
_tmp_b.allocator()->init(info_b);
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp_a);
+ _memory_group.manage(&_tmp_b);
+
// Configure kernels
_interleave_kernel.configure(a, &_tmp_a);
_transpose_kernel.configure(b, &_tmp_b);
@@ -74,6 +78,8 @@ void CLGEMMLowp::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *ou
void CLGEMMLowp::run()
{
+ _memory_group.acquire();
+
/* Run interleave kernel */
CLScheduler::get().enqueue(_interleave_kernel, false);
@@ -82,4 +88,6 @@ void CLGEMMLowp::run()
/* Run matrix multiply kernel */
CLScheduler::get().enqueue(_mm_kernel, false);
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp
index b1b5a03ac1..1470d5cdc1 100644
--- a/src/runtime/CL/functions/CLHOGDescriptor.cpp
+++ b/src/runtime/CL/functions/CLHOGDescriptor.cpp
@@ -31,8 +31,8 @@
using namespace arm_compute;
-CLHOGDescriptor::CLHOGDescriptor()
- : _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
+CLHOGDescriptor::CLHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
{
}
@@ -71,9 +71,16 @@ void CLHOGDescriptor::configure(ICLTensor *input, ICLTensor *output, const IHOG
TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
_hog_space.allocator()->init(info_space);
+ // Manage intermediate buffers
+ _memory_group.manage(&_mag);
+ _memory_group.manage(&_phase);
+
// Initialise gradient kernel
_gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
+ // Manage intermediate buffers
+ _memory_group.manage(&_hog_space);
+
// Initialise orientation binning kernel
_orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info());
@@ -88,6 +95,8 @@ void CLHOGDescriptor::configure(ICLTensor *input, ICLTensor *output, const IHOG
void CLHOGDescriptor::run()
{
+ _memory_group.acquire();
+
// Run gradient
_gradient.run();
@@ -96,4 +105,6 @@ void CLHOGDescriptor::run()
// Run block normalization
CLScheduler::get().enqueue(_block_norm);
+
+ _memory_group.release();
} \ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp
index 2387474358..51aeaed5cf 100644
--- a/src/runtime/CL/functions/CLHOGGradient.cpp
+++ b/src/runtime/CL/functions/CLHOGGradient.cpp
@@ -29,8 +29,8 @@
using namespace arm_compute;
-CLHOGGradient::CLHOGGradient()
- : _derivative(), _mag_phase(), _gx(), _gy()
+CLHOGGradient::CLHOGGradient(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _derivative(), _mag_phase(), _gx(), _gy()
{
}
@@ -47,6 +47,10 @@ void CLHOGGradient::configure(ICLTensor *input, ICLTensor *output_magnitude, ICL
_gx.allocator()->init(info);
_gy.allocator()->init(info);
+ // Manage intermediate buffers
+ _memory_group.manage(&_gx);
+ _memory_group.manage(&_gy);
+
// Initialise derivate kernel
_derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value);
@@ -67,9 +71,13 @@ void CLHOGGradient::configure(ICLTensor *input, ICLTensor *output_magnitude, ICL
void CLHOGGradient::run()
{
+ _memory_group.acquire();
+
// Run derivative
_derivative.run();
// Run magnitude/phase kernel
CLScheduler::get().enqueue(_mag_phase);
+
+ _memory_group.release();
} \ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
index 9eed355710..8012c2f60a 100644
--- a/src/runtime/CL/functions/CLHOGMultiDetection.cpp
+++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
@@ -34,8 +34,9 @@
using namespace arm_compute;
-CLHOGMultiDetection::CLHOGMultiDetection() // NOLINT
- : _gradient_kernel(),
+CLHOGMultiDetection::CLHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _gradient_kernel(),
_orient_bin_kernel(),
_block_norm_kernel(),
_hog_detect_kernel(),
@@ -141,6 +142,10 @@ void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_h
TensorInfo info_phase(shape_img, Format::U8);
_phase.allocator()->init(info_phase);
+ // Manage intermediate buffers
+ _memory_group.manage(&_mag);
+ _memory_group.manage(&_phase);
+
// Initialise gradient kernel
_gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value);
@@ -166,10 +171,17 @@ void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_h
TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
_hog_space[i].allocator()->init(info_space);
+ // Manage intermediate buffers
+ _memory_group.manage(_hog_space.get() + i);
+
// Initialise orientation binning kernel
_orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
}
+ // Allocate intermediate tensors
+ _mag.allocator()->allocate();
+ _phase.allocator()->allocate();
+
// Configure CLTensor for the normalized HOG space and block normalization kernel
for(size_t i = 0; i < _num_block_norm_kernel; ++i)
{
@@ -180,10 +192,19 @@ void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_h
TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
_hog_norm_space[i].allocator()->init(tensor_info);
+ // Manage intermediate buffers
+ _memory_group.manage(_hog_norm_space.get() + i);
+
// Initialize block normalization kernel
_block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
}
+ // Allocate intermediate tensors
+ for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ {
+ _hog_space[i].allocator()->allocate();
+ }
+
detection_window_strides->map(CLScheduler::get().queue(), true);
// Configure HOG detector kernel
@@ -200,14 +221,6 @@ void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_h
_non_maxima_kernel->configure(_detection_windows, min_distance);
// Allocate intermediate tensors
- _mag.allocator()->allocate();
- _phase.allocator()->allocate();
-
- for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
- {
- _hog_space[i].allocator()->allocate();
- }
-
for(size_t i = 0; i < _num_block_norm_kernel; ++i)
{
_hog_norm_space[i].allocator()->allocate();
@@ -218,6 +231,8 @@ void CLHOGMultiDetection::run()
{
ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
+ _memory_group.acquire();
+
// Reset detection window
_detection_windows->clear();
@@ -250,4 +265,6 @@ void CLHOGMultiDetection::run()
Scheduler::get().schedule(_non_maxima_kernel.get(), Window::DimY);
_detection_windows->unmap(CLScheduler::get().queue());
}
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLL2Normalize.cpp b/src/runtime/CL/functions/CLL2Normalize.cpp
index 18d05beba2..99be8cae4c 100644
--- a/src/runtime/CL/functions/CLL2Normalize.cpp
+++ b/src/runtime/CL/functions/CLL2Normalize.cpp
@@ -34,13 +34,16 @@
using namespace arm_compute;
-CLL2Normalize::CLL2Normalize()
- : _reduce_func(), _normalize_kernel(), _sumsq()
+CLL2Normalize::CLL2Normalize(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
{
}
void CLL2Normalize::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, float epsilon)
{
+ // Manage intermediate buffers
+ _memory_group.manage(&_sumsq);
+
// Configure kernels
_reduce_func.configure(input, &_sumsq, axis, ReductionOperation::SUM_SQUARE);
_normalize_kernel.configure(input, &_sumsq, output, axis, epsilon);
@@ -51,6 +54,10 @@ void CLL2Normalize::configure(ICLTensor *input, ICLTensor *output, unsigned int
void CLL2Normalize::run()
{
+ _memory_group.acquire();
+
_reduce_func.run();
CLScheduler::get().enqueue(_normalize_kernel, true);
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index ef6fb50bbf..a89a45a044 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -33,8 +33,9 @@
using namespace arm_compute;
-CLLocallyConnectedLayer::CLLocallyConnectedLayer()
- : _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), _is_first_run(false)
+CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
+ _is_first_run(false)
{
}
@@ -99,6 +100,10 @@ void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor
shape_gemm.set(1, mat_input_rows);
_gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
+ // Manage intermediate buffers
+ _memory_group.manage(&_input_im2col_reshaped);
+ _memory_group.manage(&_gemm_output);
+
// Configure kernels
_input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(conv_w, conv_h), conv_info, _has_bias);
_weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
@@ -120,6 +125,8 @@ void CLLocallyConnectedLayer::run()
CLScheduler::get().enqueue(_weights_reshape_kernel);
}
+ _memory_group.acquire();
+
// Run input reshaping
CLScheduler::get().enqueue(_input_im2col_kernel);
@@ -128,4 +135,6 @@ void CLLocallyConnectedLayer::run()
// Reshape output matrix
CLScheduler::get().enqueue(_output_col2im_kernel, false);
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp b/src/runtime/CL/functions/CLOpticalFlow.cpp
index 07ca2f91b4..d00b1b5099 100644
--- a/src/runtime/CL/functions/CLOpticalFlow.cpp
+++ b/src/runtime/CL/functions/CLOpticalFlow.cpp
@@ -37,8 +37,9 @@
using namespace arm_compute;
-CLOpticalFlow::CLOpticalFlow() // NOLINT
- : _tracker_init_kernel(),
+CLOpticalFlow::CLOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _tracker_init_kernel(),
_tracker_stage0_kernel(),
_tracker_stage1_kernel(),
_tracker_finalize_kernel(),
@@ -116,6 +117,10 @@ void CLOpticalFlow::configure(const CLPyramid *old_pyramid, const CLPyramid *new
_scharr_gx[i].allocator()->init(tensor_info);
_scharr_gy[i].allocator()->init(tensor_info);
+ // Manage intermediate buffers
+ _memory_group.manage(_scharr_gx.get() + i);
+ _memory_group.manage(_scharr_gy.get() + i);
+
// Init Scharr kernel
_func_scharr[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value);
@@ -144,6 +149,8 @@ void CLOpticalFlow::run()
{
ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
+ _memory_group.acquire();
+
for(unsigned int level = _num_levels; level > 0; --level)
{
// Run Scharr kernel
@@ -160,4 +167,6 @@ void CLOpticalFlow::run()
}
CLScheduler::get().enqueue(_tracker_finalize_kernel, true);
+
+ _memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index 5bb33205ca..6643c9bd46 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -35,8 +35,8 @@
using namespace arm_compute;
-CLReductionOperation::CLReductionOperation()
- : _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages()
+CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages()
{
}
@@ -59,6 +59,7 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign
shape.set(0, ceil(shape.x() / 128.f));
auto *tensor = new CLTensor;
tensor->allocator()->init(TensorInfo(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+ _memory_group.manage(tensor);
_sums_vector.push_back(tensor);
}
@@ -76,9 +77,13 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign
void CLReductionOperation::run()
{
+ _memory_group.acquire();
+
for(unsigned int i = 0; i < _num_of_stages; ++i)
{
CLScheduler::get().enqueue(_border_handlers_vector[i], false);
CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
}
+
+ _memory_group.release();
} \ No newline at end of file