From 8a94e7cec7b09a417a278425e2b56e7af5bf45d9 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Fri, 15 Sep 2017 19:06:47 +0100
Subject: COMPMID-534: Add MemoryManager support in OpenCL functions

Adds support for:
-CLConvolution
-CLGEMM
-CLGEMMLowp
-CLHOGDescriptor
-CLHOGGradient
-CLHOGMultiDetection
-CLL2Normalize
-CLLocallyConnectedLayer
-CLOpticalFlow
-CLReductionOperation

Change-Id: Ib13354d274ccf32ae933f3fbbad3ac3896cfd3bd
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/87938
Tested-by: Kaizen
Reviewed-by: Pablo Tello
---
 src/runtime/CL/functions/CLConvolution.cpp        | 11 +++++--
 src/runtime/CL/functions/CLGEMM.cpp               | 12 +++++--
 src/runtime/CL/functions/CLGEMMLowp.cpp           | 12 +++++--
 src/runtime/CL/functions/CLHOGDescriptor.cpp      | 15 +++++++--
 src/runtime/CL/functions/CLHOGGradient.cpp        | 12 +++++--
 src/runtime/CL/functions/CLHOGMultiDetection.cpp  | 37 ++++++++++++++++------
 src/runtime/CL/functions/CLL2Normalize.cpp        | 11 +++++--
 .../CL/functions/CLLocallyConnectedLayer.cpp      | 13 ++++++--
 src/runtime/CL/functions/CLOpticalFlow.cpp        | 13 ++++++--
 src/runtime/CL/functions/CLReductionOperation.cpp |  9 ++++--
 10 files changed, 117 insertions(+), 28 deletions(-)

diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp
index 641044451d..a9b086773c 100644
--- a/src/runtime/CL/functions/CLConvolution.cpp
+++ b/src/runtime/CL/functions/CLConvolution.cpp
@@ -47,8 +47,8 @@ void CLConvolution3x3::configure(ICLTensor *input, ICLTensor *output, const int1
 }
 
 template <unsigned int matrix_size>
-CLConvolutionSquare<matrix_size>::CLConvolutionSquare()
-    : _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
+CLConvolutionSquare<matrix_size>::CLConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
 {
 }
@@ -66,6 +66,9 @@ void CLConvolutionSquare<matrix_size>::configure(ICLTensor *input, ICLTensor *ou
         std::pair<DataType, DataType> type_pair = data_type_for_convolution(conv_col, conv_row, matrix_size);
         _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, type_pair.first));
 
+        // Manage intermediate buffers
+        _memory_group.manage(&_tmp);
+
         if(scale == 0)
         {
             scale = calculate_matrix_scale(conv, matrix_size);
         }
@@ -92,8 +95,12 @@ void CLConvolutionSquare<matrix_size>::run()
     if(_is_separable)
     {
+        _memory_group.acquire();
+
         CLScheduler::get().enqueue(_kernel_hor, false);
         CLScheduler::get().enqueue(_kernel_vert);
+
+        _memory_group.release();
     }
     else
     {
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 9867229a7c..a81d1138c0 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -38,8 +38,8 @@
 
 using namespace arm_compute;
 
-CLGEMM::CLGEMM()
-    : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false)
+CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false)
 {
 }
@@ -86,6 +86,10 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
     TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), b->info()->fixed_point_position());
     _tmp_b.allocator()->init(info_b);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_tmp_a);
+    _memory_group.manage(&_tmp_b);
+
     // Configure interleave kernel
     _interleave_kernel.configure(a, &_tmp_a);
@@ -115,6 +119,8 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
 
 void CLGEMM::run()
 {
+    _memory_group.acquire();
+
     if(_is_interleaved_transposed)
     {
         // Run interleave kernel
@@ -132,4 +138,6 @@ void CLGEMM::run()
     {
         CLScheduler::get().enqueue(_ma_kernel);
     }
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLGEMMLowp.cpp b/src/runtime/CL/functions/CLGEMMLowp.cpp
index 45e011d8ce..db6d11c2c3 100644
--- a/src/runtime/CL/functions/CLGEMMLowp.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowp.cpp
@@ -33,8 +33,8 @@
 
 using namespace arm_compute;
 
-CLGEMMLowp::CLGEMMLowp()
-    : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
+CLGEMMLowp::CLGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
 {
 }
@@ -62,6 +62,10 @@ void CLGEMMLowp::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *ou
     TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
     _tmp_b.allocator()->init(info_b);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_tmp_a);
+    _memory_group.manage(&_tmp_b);
+
     // Configure kernels
     _interleave_kernel.configure(a, &_tmp_a);
     _transpose_kernel.configure(b, &_tmp_b);
@@ -74,6 +78,8 @@ void CLGEMMLowp::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *ou
 
 void CLGEMMLowp::run()
 {
+    _memory_group.acquire();
+
     /* Run interleave kernel */
     CLScheduler::get().enqueue(_interleave_kernel, false);
 
@@ -82,4 +88,6 @@ void CLGEMMLowp::run()
 
     /* Run matrix multiply kernel */
     CLScheduler::get().enqueue(_mm_kernel, false);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp
index b1b5a03ac1..1470d5cdc1 100644
--- a/src/runtime/CL/functions/CLHOGDescriptor.cpp
+++ b/src/runtime/CL/functions/CLHOGDescriptor.cpp
@@ -31,8 +31,8 @@
 
 using namespace arm_compute;
 
-CLHOGDescriptor::CLHOGDescriptor()
-    : _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
+CLHOGDescriptor::CLHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
 {
 }
@@ -71,9 +71,16 @@ void CLHOGDescriptor::configure(ICLTensor *input, ICLTensor *output, const IHOG
     TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
     _hog_space.allocator()->init(info_space);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_mag);
+    _memory_group.manage(&_phase);
+
     // Initialise gradient kernel
     _gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_hog_space);
+
     // Initialise orientation binning kernel
     _orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info());
 
@@ -88,6 +95,8 @@ void CLHOGDescriptor::configure(ICLTensor *input, ICLTensor *output, const IHOG
 
 void CLHOGDescriptor::run()
 {
+    _memory_group.acquire();
+
     // Run gradient
     _gradient.run();
 
@@ -96,4 +105,6 @@ void CLHOGDescriptor::run()
 
     // Run block normalization
     CLScheduler::get().enqueue(_block_norm);
+
+    _memory_group.release();
 }
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp
index 2387474358..51aeaed5cf 100644
--- a/src/runtime/CL/functions/CLHOGGradient.cpp
+++ b/src/runtime/CL/functions/CLHOGGradient.cpp
@@ -29,8 +29,8 @@
 
 using namespace arm_compute;
 
-CLHOGGradient::CLHOGGradient()
-    : _derivative(), _mag_phase(), _gx(), _gy()
+CLHOGGradient::CLHOGGradient(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _derivative(), _mag_phase(), _gx(), _gy()
 {
 }
@@ -47,6 +47,10 @@ void CLHOGGradient::configure(ICLTensor *input, ICLTensor *output_magnitude, ICL
     _gx.allocator()->init(info);
     _gy.allocator()->init(info);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_gx);
+    _memory_group.manage(&_gy);
+
     // Initialise derivate kernel
     _derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value);
 
@@ -67,9 +71,13 @@ void CLHOGGradient::configure(ICLTensor *input, ICLTensor *output_magnitude, ICL
 
 void CLHOGGradient::run()
 {
+    _memory_group.acquire();
+
     // Run derivative
     _derivative.run();
 
     // Run magnitude/phase kernel
     CLScheduler::get().enqueue(_mag_phase);
+
+    _memory_group.release();
 }
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
index 9eed355710..8012c2f60a 100644
--- a/src/runtime/CL/functions/CLHOGMultiDetection.cpp
+++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
@@ -34,8 +34,9 @@
 
 using namespace arm_compute;
 
-CLHOGMultiDetection::CLHOGMultiDetection() // NOLINT
-    : _gradient_kernel(),
+CLHOGMultiDetection::CLHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _gradient_kernel(),
       _orient_bin_kernel(),
       _block_norm_kernel(),
       _hog_detect_kernel(),
@@ -141,6 +142,10 @@ void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_h
     TensorInfo info_phase(shape_img, Format::U8);
     _phase.allocator()->init(info_phase);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_mag);
+    _memory_group.manage(&_phase);
+
     // Initialise gradient kernel
     _gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value);
 
@@ -166,10 +171,17 @@ void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_h
         TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
         _hog_space[i].allocator()->init(info_space);
 
+        // Manage intermediate buffers
+        _memory_group.manage(_hog_space.get() + i);
+
         // Initialise orientation binning kernel
         _orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
     }
 
+    // Allocate intermediate tensors
+    _mag.allocator()->allocate();
+    _phase.allocator()->allocate();
+
     // Configure CLTensor for the normalized HOG space and block normalization kernel
     for(size_t i = 0; i < _num_block_norm_kernel; ++i)
     {
@@ -180,10 +192,19 @@ void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_h
         TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
         _hog_norm_space[i].allocator()->init(tensor_info);
 
+        // Manage intermediate buffers
+        _memory_group.manage(_hog_norm_space.get() + i);
+
         // Initialize block normalization kernel
         _block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
     }
 
+    // Allocate intermediate tensors
+    for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+    {
+        _hog_space[i].allocator()->allocate();
+    }
+
     detection_window_strides->map(CLScheduler::get().queue(), true);
 
     // Configure HOG detector kernel
@@ -200,14 +221,6 @@ void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_h
     _non_maxima_kernel->configure(_detection_windows, min_distance);
 
     // Allocate intermediate tensors
-    _mag.allocator()->allocate();
-    _phase.allocator()->allocate();
-
-    for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
-    {
-        _hog_space[i].allocator()->allocate();
-    }
-
     for(size_t i = 0; i < _num_block_norm_kernel; ++i)
     {
         _hog_norm_space[i].allocator()->allocate();
     }
@@ -218,6 +231,8 @@ void CLHOGMultiDetection::run()
 {
     ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
 
+    _memory_group.acquire();
+
     // Reset detection window
     _detection_windows->clear();
 
@@ -250,4 +265,6 @@ void CLHOGMultiDetection::run()
         Scheduler::get().schedule(_non_maxima_kernel.get(), Window::DimY);
         _detection_windows->unmap(CLScheduler::get().queue());
     }
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLL2Normalize.cpp b/src/runtime/CL/functions/CLL2Normalize.cpp
index 18d05beba2..99be8cae4c 100644
--- a/src/runtime/CL/functions/CLL2Normalize.cpp
+++ b/src/runtime/CL/functions/CLL2Normalize.cpp
@@ -34,13 +34,16 @@
 
 using namespace arm_compute;
 
-CLL2Normalize::CLL2Normalize()
-    : _reduce_func(), _normalize_kernel(), _sumsq()
+CLL2Normalize::CLL2Normalize(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
 {
 }
 
 void CLL2Normalize::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, float epsilon)
 {
+    // Manage intermediate buffers
+    _memory_group.manage(&_sumsq);
+
     // Configure kernels
     _reduce_func.configure(input, &_sumsq, axis, ReductionOperation::SUM_SQUARE);
     _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon);
@@ -51,6 +54,10 @@ void CLL2Normalize::configure(ICLTensor *input, ICLTensor *output, unsigned int
 
 void CLL2Normalize::run()
 {
+    _memory_group.acquire();
+
     _reduce_func.run();
     CLScheduler::get().enqueue(_normalize_kernel, true);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index ef6fb50bbf..a89a45a044 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -33,8 +33,9 @@
 
 using namespace arm_compute;
 
-CLLocallyConnectedLayer::CLLocallyConnectedLayer()
-    : _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), _is_first_run(false)
+CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
+      _is_first_run(false)
 {
 }
@@ -99,6 +100,10 @@ void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor
     shape_gemm.set(1, mat_input_rows);
     _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_input_im2col_reshaped);
+    _memory_group.manage(&_gemm_output);
+
     // Configure kernels
     _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(conv_w, conv_h), conv_info, _has_bias);
     _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
@@ -120,6 +125,8 @@ void CLLocallyConnectedLayer::run()
         CLScheduler::get().enqueue(_weights_reshape_kernel);
     }
 
+    _memory_group.acquire();
+
     // Run input reshaping
     CLScheduler::get().enqueue(_input_im2col_kernel);
 
@@ -128,4 +135,6 @@ void CLLocallyConnectedLayer::run()
 
     // Reshape output matrix
     CLScheduler::get().enqueue(_output_col2im_kernel, false);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp b/src/runtime/CL/functions/CLOpticalFlow.cpp
index 07ca2f91b4..d00b1b5099 100644
--- a/src/runtime/CL/functions/CLOpticalFlow.cpp
+++ b/src/runtime/CL/functions/CLOpticalFlow.cpp
@@ -37,8 +37,9 @@
 
 using namespace arm_compute;
 
-CLOpticalFlow::CLOpticalFlow() // NOLINT
-    : _tracker_init_kernel(),
+CLOpticalFlow::CLOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _tracker_init_kernel(),
       _tracker_stage0_kernel(),
       _tracker_stage1_kernel(),
       _tracker_finalize_kernel(),
@@ -116,6 +117,10 @@ void CLOpticalFlow::configure(const CLPyramid *old_pyramid, const CLPyramid *new
         _scharr_gx[i].allocator()->init(tensor_info);
         _scharr_gy[i].allocator()->init(tensor_info);
 
+        // Manage intermediate buffers
+        _memory_group.manage(_scharr_gx.get() + i);
+        _memory_group.manage(_scharr_gy.get() + i);
+
         // Init Scharr kernel
         _func_scharr[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value);
 
@@ -144,6 +149,8 @@ void CLOpticalFlow::run()
 {
     ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
 
+    _memory_group.acquire();
+
     for(unsigned int level = _num_levels; level > 0; --level)
     {
         // Run Scharr kernel
@@ -160,4 +167,6 @@ void CLOpticalFlow::run()
     }
 
     CLScheduler::get().enqueue(_tracker_finalize_kernel, true);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index 5bb33205ca..6643c9bd46 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -35,8 +35,8 @@
 
 using namespace arm_compute;
 
-CLReductionOperation::CLReductionOperation()
-    : _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages()
+CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages()
 {
 }
@@ -59,6 +59,7 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign
         shape.set(0, ceil(shape.x() / 128.f));
         auto *tensor = new CLTensor;
         tensor->allocator()->init(TensorInfo(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+        _memory_group.manage(tensor);
         _sums_vector.push_back(tensor);
     }
 
@@ -76,9 +77,13 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign
 
 void CLReductionOperation::run()
 {
+    _memory_group.acquire();
+
     for(unsigned int i = 0; i < _num_of_stages; ++i)
    {
         CLScheduler::get().enqueue(_border_handlers_vector[i], false);
         CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
     }
+
+    _memory_group.release();
 }
\ No newline at end of file
--
cgit v1.2.1
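
Usage note: the following is a minimal sketch of how a caller could share one memory manager across the functions touched by this patch, so that the intermediate tensors they register with _memory_group.manage() in configure() (and bracket with acquire()/release() in run()) are served from a common pool. It assumes the MemoryManagerOnDemand, BlobLifetimeManager, PoolManager and CLBufferAllocator types of this library generation; the finalization calls shown (set_allocator(), set_num_pools(), finalize()) are an assumption about the runtime API of that period and have been reworked in later releases, so treat this as a sketch rather than the exact interface.

// Sketch only: the memory-manager setup below is version-dependent (see note above).
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/BlobLifetimeManager.h"
#include "arm_compute/runtime/CL/CLBufferAllocator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"
#include "arm_compute/runtime/MemoryManagerOnDemand.h"
#include "arm_compute/runtime/PoolManager.h"

#include <memory>

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // One memory manager, shared by every function that is constructed with it.
    auto lifetime_mgr = std::make_shared<BlobLifetimeManager>();
    auto pool_mgr     = std::make_shared<PoolManager>();
    auto memory_mgr   = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);

    CLTensor a, b, dst;
    a.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));

    // configure() registers the function's intermediate tensors (_tmp_a/_tmp_b above) with its memory group.
    CLGEMM gemm(memory_mgr);
    gemm.configure(&a, &b, nullptr, &dst, 1.f, 0.f);

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();

    // Finalize the manager once all functions sharing it have been configured.
    // NOTE: assumed calls; later releases replace them with a single populate() step.
    CLBufferAllocator cl_allocator;
    memory_mgr->set_allocator(&cl_allocator);
    memory_mgr->set_num_pools(1);
    memory_mgr->finalize();

    // run() acquires the group's backing memory, enqueues the kernels, then releases it.
    gemm.run();
    CLScheduler::get().sync();

    return 0;
}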