From b94c368a39ba6518d843c66a156f825a2c66af60 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Fri, 27 Sep 2019 19:07:15 +0100 Subject: COMPMID-2645: Detection Post-Process fixes -Uses temporary buffers within run() to avoid keeping state -Reworks and cleans the implementation Change-Id: I34b5bad7f8a07e284214ec61de9cf3a2eb1f608c Signed-off-by: Georgios Pinitas Reviewed-on: https://review.mlplatform.org/c/2000 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Michele Di Giorgio Reviewed-by: Isabella Gottardi --- .../CPP/kernels/CPPNonMaximumSuppressionKernel.h | 5 - .../CPP/functions/CPPDetectionPostProcessLayer.h | 9 -- examples/graph_ssd_mobilenet.cpp | 1 - .../CPP/kernels/CPPNonMaximumSuppressionKernel.cpp | 60 ++++++------ .../CPP/functions/CPPDetectionPostProcessLayer.cpp | 103 ++++++++++----------- 5 files changed, 78 insertions(+), 100 deletions(-) diff --git a/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h b/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h index dd035d5cb1..351b9842c7 100644 --- a/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h +++ b/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h @@ -92,11 +92,6 @@ private: float _iou_threshold; unsigned int _num_boxes; - - std::vector _scores_above_thd_vector; - std::vector _indices_above_thd_vector; - std::vector _visited; - std::vector _sorted_indices; }; } // namespace arm_compute #endif /* __ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSIONKERNEL_LAYER_H__ */ diff --git a/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h b/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h index c13def67c7..1c918d220c 100644 --- a/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h +++ b/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h @@ -109,15 +109,6 @@ private: Tensor _selected_indices; Tensor _class_scores; const ITensor *_input_scores_to_use; - - // Intermediate results - std::vector 
_result_idx_boxes_after_nms; - std::vector _result_classes_after_nms; - std::vector _result_scores_after_nms; - std::vector _sorted_indices; - - // Temporary values - std::vector _box_scores; }; } // namespace arm_compute #endif /* __ARM_COMPUTE_CPP_DETECTION_POSTPROCESS_H__ */ diff --git a/examples/graph_ssd_mobilenet.cpp b/examples/graph_ssd_mobilenet.cpp index 01d7a866e4..233d22c8ed 100644 --- a/examples/graph_ssd_mobilenet.cpp +++ b/examples/graph_ssd_mobilenet.cpp @@ -519,7 +519,6 @@ private: // Quantization info taken from the TfLite SSD MobileNet example const QuantizationInfo in_quant_info = QuantizationInfo(0.0078125f, 128); - // Create core graph graph << InputLayer(input_descriptor.set_quantization_info(in_quant_info), get_weights_accessor(data_path, common_params.image, DataLayout::NHWC)); diff --git a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp index f7edf8edd0..750f44d972 100644 --- a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp +++ b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp @@ -28,7 +28,7 @@ #include "arm_compute/core/Validate.h" #include "support/ToolchainSupport.h" -#include +#include namespace arm_compute { @@ -54,14 +54,12 @@ Status validate_arguments(const ITensorInfo *bboxes, const ITensorInfo *scores, } // namespace CPPNonMaximumSuppressionKernel::CPPNonMaximumSuppressionKernel() - : _input_bboxes(nullptr), _input_scores(nullptr), _output_indices(nullptr), _max_output_size(0), _score_threshold(0.f), _iou_threshold(0.f), _num_boxes(0), _scores_above_thd_vector(), - _indices_above_thd_vector(), _visited(), _sorted_indices() + : _input_bboxes(nullptr), _input_scores(nullptr), _output_indices(nullptr), _max_output_size(0), _score_threshold(0.f), _iou_threshold(0.f), _num_boxes(0) { } -void CPPNonMaximumSuppressionKernel::configure( - const ITensor *input_bboxes, const ITensor *input_scores, ITensor *output_indices, unsigned int max_output_size, - const 
float score_threshold, const float iou_threshold) +void CPPNonMaximumSuppressionKernel::configure(const ITensor *input_bboxes, const ITensor *input_scores, ITensor *output_indices, + unsigned int max_output_size, const float score_threshold, const float iou_threshold) { ARM_COMPUTE_ERROR_ON_NULLPTR(input_bboxes, input_scores, output_indices); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_bboxes->info(), input_scores->info(), output_indices->info(), max_output_size, score_threshold, iou_threshold)); @@ -76,14 +74,6 @@ void CPPNonMaximumSuppressionKernel::configure( _max_output_size = max_output_size; _num_boxes = input_scores->info()->dimension(0); - _scores_above_thd_vector.reserve(_num_boxes); - _indices_above_thd_vector.reserve(_num_boxes); - - // Visited and sorted_indices are preallocated as num_boxes size, which is the maximum size possible - // Will be used only N elements where N is the number of score above the threshold - _visited.reserve(_num_boxes); - _sorted_indices.reserve(_num_boxes); - // Configure kernel window Window win = calculate_max_window(*output_indices->info(), Steps()); @@ -91,9 +81,8 @@ void CPPNonMaximumSuppressionKernel::configure( ICPPKernel::configure(win); } -Status CPPNonMaximumSuppressionKernel::validate( - const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *output_indices, unsigned int max_output_size, - const float score_threshold, const float iou_threshold) +Status CPPNonMaximumSuppressionKernel::validate(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *output_indices, + unsigned int max_output_size, const float score_threshold, const float iou_threshold) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(bboxes, scores, output_indices, max_output_size, score_threshold, iou_threshold)); return Status{}; @@ -106,33 +95,37 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); 
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); - unsigned int num_above_thd = 0; + // Auxiliary tensors + std::vector indices_above_thd; + std::vector scores_above_thd; for(unsigned int i = 0; i < _num_boxes; ++i) { const float score_i = *(reinterpret_cast(_input_scores->ptr_to_element(Coordinates(i)))); if(score_i >= _score_threshold) { - _indices_above_thd_vector.emplace_back(i); - _scores_above_thd_vector.emplace_back(score_i); - // Initialize respective index and visited - _sorted_indices.emplace_back(num_above_thd); - _visited.push_back(false); - ++num_above_thd; + scores_above_thd.emplace_back(score_i); + indices_above_thd.emplace_back(i); } } // Sort selected indices based on scores - std::sort(_sorted_indices.begin(), - _sorted_indices.end(), + const unsigned int num_above_thd = indices_above_thd.size(); + std::vector sorted_indices; + sorted_indices.resize(num_above_thd); + std::iota(sorted_indices.data(), sorted_indices.data() + num_above_thd, 0); + std::sort(std::begin(sorted_indices), + std::end(sorted_indices), [&](unsigned int first, unsigned int second) { - return _scores_above_thd_vector[first] > _scores_above_thd_vector[second]; + return scores_above_thd[first] > scores_above_thd[second]; }); // Number of output is the minimum between max_detection and the scores above the threshold const unsigned int num_output = std::min(_max_output_size, num_above_thd); unsigned int output_idx = 0; + std::vector visited(num_above_thd, false); + // Keep only boxes with small IoU for(unsigned int i = 0; i < num_above_thd; ++i) { // Check if the output is full @@ -142,9 +135,10 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo } // Check if it was already visited, if not add it to the output and update the indices counter - if(!_visited[_sorted_indices[i]]) + if(!visited[sorted_indices[i]]) { - *(reinterpret_cast(_output_indices->ptr_to_element(Coordinates(output_idx)))) = 
_indices_above_thd_vector[_sorted_indices[i]]; + *(reinterpret_cast(_output_indices->ptr_to_element(Coordinates(output_idx)))) = indices_above_thd[sorted_indices[i]]; + visited[sorted_indices[i]] = true; ++output_idx; } else @@ -155,11 +149,11 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo // Once added one element at the output check if the next ones overlap and can be skipped for(unsigned int j = i + 1; j < num_above_thd; ++j) { - if(!_visited[_sorted_indices[j]]) + if(!visited[sorted_indices[j]]) { // Calculate IoU - const unsigned int i_index = _indices_above_thd_vector[_sorted_indices[i]]; - const unsigned int j_index = _indices_above_thd_vector[_sorted_indices[j]]; + const unsigned int i_index = indices_above_thd[sorted_indices[i]]; + const unsigned int j_index = indices_above_thd[sorted_indices[j]]; // Box-corner format: xmin, ymin, xmax, ymax const auto box_i_xmin = *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(0, i_index)))); const auto box_i_ymin = *(reinterpret_cast(_input_bboxes->ptr_to_element(Coordinates(1, i_index)))); @@ -190,7 +184,7 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo if(overlap > _iou_threshold) { - _visited[_sorted_indices[j]] = true; + visited[sorted_indices[j]] = true; } } } diff --git a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp index 2997b593c6..7b4f7b97c4 100644 --- a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp +++ b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp @@ -156,7 +156,7 @@ void SaveOutputs(const Tensor *decoded_boxes, const std::vector &result_idx std::vector &sorted_indices, const unsigned int num_output, const unsigned int max_detections, ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection) { - // ymin,xmin,ymax,xmax -> xmin,ymin,xmax,ymax + // xmin,ymin,xmax,ymax -> ymin,xmin,ymax,xmax 
unsigned int i = 0; for(; i < num_output; ++i) { @@ -184,7 +184,7 @@ void SaveOutputs(const Tensor *decoded_boxes, const std::vector &result_idx CPPDetectionPostProcessLayer::CPPDetectionPostProcessLayer(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _nms(), _input_box_encoding(nullptr), _input_scores(nullptr), _input_anchors(nullptr), _output_boxes(nullptr), _output_classes(nullptr), _output_scores(nullptr), _num_detection(nullptr), _info(), _num_boxes(), _num_classes_with_background(), _num_max_detected_boxes(), _decoded_boxes(), _decoded_scores(), _selected_indices(), - _class_scores(), _input_scores_to_use(nullptr), _result_idx_boxes_after_nms(), _result_classes_after_nms(), _result_scores_after_nms(), _sorted_indices(), _box_scores() + _class_scores(), _input_scores_to_use(nullptr) { } @@ -217,8 +217,7 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, auto_init_if_empty(*_decoded_boxes.info(), TensorInfo(TensorShape(_kNumCoordBox, _input_box_encoding->info()->dimension(1), _kBatchSize), 1, DataType::F32)); auto_init_if_empty(*_decoded_scores.info(), TensorInfo(TensorShape(_input_scores->info()->dimension(0), _input_scores->info()->dimension(1), _kBatchSize), 1, DataType::F32)); - auto_init_if_empty(*_selected_indices.info(), TensorInfo(TensorShape(info.max_detections()), 1, DataType::S32)); - + auto_init_if_empty(*_selected_indices.info(), TensorInfo(TensorShape(info.use_regular_nms() ? info.detection_per_class() : info.max_detections()), 1, DataType::S32)); const unsigned int num_classes_per_box = std::min(info.max_classes_per_detection(), info.num_classes()); auto_init_if_empty(*_class_scores.info(), TensorInfo(info.use_regular_nms() ? 
TensorShape(_num_boxes) : TensorShape(_num_boxes * num_classes_per_box), 1, DataType::F32)); @@ -236,21 +235,6 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, _decoded_scores.allocator()->allocate(); _selected_indices.allocator()->allocate(); _class_scores.allocator()->allocate(); - - if(info.use_regular_nms()) - { - _result_idx_boxes_after_nms.reserve(_info.detection_per_class() * _info.num_classes()); - _result_classes_after_nms.reserve(_info.detection_per_class() * _info.num_classes()); - _result_scores_after_nms.reserve(_info.detection_per_class() * _info.num_classes()); - } - else - { - _result_scores_after_nms.reserve(num_classes_per_box * _num_boxes); - _result_classes_after_nms.reserve(num_classes_per_box * _num_boxes); - _result_scores_after_nms.reserve(num_classes_per_box * _num_boxes); - _box_scores.reserve(_num_boxes); - } - _sorted_indices.resize(info.use_regular_nms() ? info.max_detections() : info.num_classes()); } Status CPPDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors, @@ -288,9 +272,15 @@ void CPPDetectionPostProcessLayer::run() } } } + // Regular NMS if(_info.use_regular_nms()) { + std::vector result_idx_boxes_after_nms; + std::vector result_classes_after_nms; + std::vector result_scores_after_nms; + std::vector sorted_indices; + for(unsigned int c = 0; c < num_classes; ++c) { // For each boxes get scores of the boxes for the class c @@ -299,6 +289,8 @@ void CPPDetectionPostProcessLayer::run() *(reinterpret_cast(_class_scores.ptr_to_element(Coordinates(i)))) = *(reinterpret_cast(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, i)))); // i * _num_classes_with_background + c + 1 } + + // Run Non-maxima Suppression _nms.run(); for(unsigned int i = 0; i < _info.detection_per_class(); ++i) @@ -307,67 +299,74 @@ void CPPDetectionPostProcessLayer::run() if(selected_index == -1) { // Nms will return -1 for all the 
last M-elements not valid - continue; + break; } - _result_idx_boxes_after_nms.emplace_back(selected_index); - _result_scores_after_nms.emplace_back((reinterpret_cast(_class_scores.buffer()))[selected_index]); - _result_classes_after_nms.emplace_back(c); + result_idx_boxes_after_nms.emplace_back(selected_index); + result_scores_after_nms.emplace_back((reinterpret_cast(_class_scores.buffer()))[selected_index]); + result_classes_after_nms.emplace_back(c); } } // We select the max detection numbers of the highest score of all classes - const auto num_selected = _result_idx_boxes_after_nms.size(); + const auto num_selected = result_scores_after_nms.size(); const auto num_output = std::min(max_detections, num_selected); // Sort selected indices based on result scores - std::iota(_sorted_indices.begin(), _sorted_indices.end(), 0); - std::partial_sort(_sorted_indices.data(), - _sorted_indices.data() + num_output, - _sorted_indices.data() + num_selected, + sorted_indices.resize(num_selected); + std::iota(sorted_indices.begin(), sorted_indices.end(), 0); + std::partial_sort(sorted_indices.data(), + sorted_indices.data() + num_output, + sorted_indices.data() + num_selected, [&](unsigned int first, unsigned int second) { - return _result_scores_after_nms[first] > _result_scores_after_nms[second]; + return result_scores_after_nms[first] > result_scores_after_nms[second]; }); - SaveOutputs(&_decoded_boxes, _result_idx_boxes_after_nms, _result_scores_after_nms, _result_classes_after_nms, - _sorted_indices, num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection); + SaveOutputs(&_decoded_boxes, result_idx_boxes_after_nms, result_scores_after_nms, result_classes_after_nms, sorted_indices, + num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection); } // Fast NMS else { const unsigned int num_classes_per_box = std::min(_info.max_classes_per_detection(), _info.num_classes()); - for(unsigned int b = 0, index = 0; b 
< _num_boxes; ++b) + std::vector max_scores; + std::vector box_indices; + std::vector max_score_classes; + + for(unsigned int b = 0; b < _num_boxes; ++b) { - _box_scores.clear(); - _sorted_indices.clear(); + std::vector box_scores; for(unsigned int c = 0; c < num_classes; ++c) { - _box_scores.emplace_back(*(reinterpret_cast(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, b))))); - _sorted_indices.push_back(c); + box_scores.emplace_back(*(reinterpret_cast(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, b))))); } - std::partial_sort(_sorted_indices.data(), - _sorted_indices.data() + num_classes_per_box, - _sorted_indices.data() + num_classes, + + std::vector max_score_indices; + max_score_indices.resize(_info.num_classes()); + std::iota(max_score_indices.data(), max_score_indices.data() + _info.num_classes(), 0); + std::partial_sort(max_score_indices.data(), + max_score_indices.data() + num_classes_per_box, + max_score_indices.data() + num_classes, [&](unsigned int first, unsigned int second) { - return _box_scores[first] > _box_scores[second]; + return box_scores[first] > box_scores[second]; }); - for(unsigned int i = 0; i < num_classes_per_box; ++i, ++index) + for(unsigned int i = 0; i < num_classes_per_box; ++i) { - const float score_to_add = _box_scores[_sorted_indices[i]]; - *(reinterpret_cast(_class_scores.ptr_to_element(Coordinates(index)))) = score_to_add; - _result_scores_after_nms.emplace_back(score_to_add); - _result_idx_boxes_after_nms.emplace_back(b); - _result_classes_after_nms.emplace_back(_sorted_indices[i]); + const float score_to_add = box_scores[max_score_indices[i]]; + *(reinterpret_cast(_class_scores.ptr_to_element(Coordinates(b * num_classes_per_box + i)))) = score_to_add; + max_scores.emplace_back(score_to_add); + box_indices.emplace_back(b); + max_score_classes.emplace_back(max_score_indices[i]); } } - // Run NMS + // Run Non-maxima Suppression _nms.run(); - _sorted_indices.clear(); + std::vector selected_indices; for(unsigned 
int i = 0; i < max_detections; ++i) { // NMS returns M valid indices, the not valid tail is filled with -1 @@ -376,13 +375,13 @@ void CPPDetectionPostProcessLayer::run() // Nms will return -1 for all the last M-elements not valid break; } - _sorted_indices.emplace_back(*(reinterpret_cast(_selected_indices.ptr_to_element(Coordinates(i))))); + selected_indices.emplace_back(*(reinterpret_cast(_selected_indices.ptr_to_element(Coordinates(i))))); } // We select the max detection numbers of the highest score of all classes - const auto num_output = std::min(_info.max_detections(), _sorted_indices.size()); + const auto num_output = std::min(_info.max_detections(), selected_indices.size()); - SaveOutputs(&_decoded_boxes, _result_idx_boxes_after_nms, _result_scores_after_nms, _result_classes_after_nms, - _sorted_indices, num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection); + SaveOutputs(&_decoded_boxes, box_indices, max_scores, max_score_classes, selected_indices, + num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection); } } } // namespace arm_compute \ No newline at end of file -- cgit v1.2.1