aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2019-09-27 19:07:15 +0100
committerGeorgios Pinitas <georgios.pinitas@arm.com>2019-10-01 15:04:42 +0000
commitb94c368a39ba6518d843c66a156f825a2c66af60 (patch)
tree384c4e4ed83389f300f491a63cd70a23e646babe
parent3d8fe497fc76ec6ad265c03fe02e29ed2ddf2d93 (diff)
downloadComputeLibrary-b94c368a39ba6518d843c66a156f825a2c66af60.tar.gz
COMPMID-2645: Detection Post-Process fixes
-Uses temporary buffers within run() to avoid keeping state -Reworks and cleans the implementation Change-Id: I34b5bad7f8a07e284214ec61de9cf3a2eb1f608c Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com> Reviewed-on: https://review.mlplatform.org/c/2000 Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com> Reviewed-by: Isabella Gottardi <isabella.gottardi@arm.com>
-rw-r--r--arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h5
-rw-r--r--arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h9
-rw-r--r--examples/graph_ssd_mobilenet.cpp1
-rw-r--r--src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp60
-rw-r--r--src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp103
5 files changed, 78 insertions, 100 deletions
diff --git a/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h b/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h
index dd035d5cb1..351b9842c7 100644
--- a/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h
@@ -92,11 +92,6 @@ private:
float _iou_threshold;
unsigned int _num_boxes;
-
- std::vector<float> _scores_above_thd_vector;
- std::vector<int> _indices_above_thd_vector;
- std::vector<bool> _visited;
- std::vector<unsigned int> _sorted_indices;
};
} // namespace arm_compute
#endif /* __ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSIONKERNEL_LAYER_H__ */
diff --git a/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h b/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h
index c13def67c7..1c918d220c 100644
--- a/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h
+++ b/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h
@@ -109,15 +109,6 @@ private:
Tensor _selected_indices;
Tensor _class_scores;
const ITensor *_input_scores_to_use;
-
- // Intermediate results
- std::vector<int> _result_idx_boxes_after_nms;
- std::vector<int> _result_classes_after_nms;
- std::vector<float> _result_scores_after_nms;
- std::vector<unsigned int> _sorted_indices;
-
- // Temporary values
- std::vector<float> _box_scores;
};
} // namespace arm_compute
#endif /* __ARM_COMPUTE_CPP_DETECTION_POSTPROCESS_H__ */
diff --git a/examples/graph_ssd_mobilenet.cpp b/examples/graph_ssd_mobilenet.cpp
index 01d7a866e4..233d22c8ed 100644
--- a/examples/graph_ssd_mobilenet.cpp
+++ b/examples/graph_ssd_mobilenet.cpp
@@ -519,7 +519,6 @@ private:
// Quantization info taken from the TfLite SSD MobileNet example
const QuantizationInfo in_quant_info = QuantizationInfo(0.0078125f, 128);
-
// Create core graph
graph << InputLayer(input_descriptor.set_quantization_info(in_quant_info),
get_weights_accessor(data_path, common_params.image, DataLayout::NHWC));
diff --git a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
index f7edf8edd0..750f44d972 100644
--- a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
+++ b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
@@ -28,7 +28,7 @@
#include "arm_compute/core/Validate.h"
#include "support/ToolchainSupport.h"
-#include <list>
+#include <algorithm>
namespace arm_compute
{
@@ -54,14 +54,12 @@ Status validate_arguments(const ITensorInfo *bboxes, const ITensorInfo *scores,
} // namespace
CPPNonMaximumSuppressionKernel::CPPNonMaximumSuppressionKernel()
- : _input_bboxes(nullptr), _input_scores(nullptr), _output_indices(nullptr), _max_output_size(0), _score_threshold(0.f), _iou_threshold(0.f), _num_boxes(0), _scores_above_thd_vector(),
- _indices_above_thd_vector(), _visited(), _sorted_indices()
+ : _input_bboxes(nullptr), _input_scores(nullptr), _output_indices(nullptr), _max_output_size(0), _score_threshold(0.f), _iou_threshold(0.f), _num_boxes(0)
{
}
-void CPPNonMaximumSuppressionKernel::configure(
- const ITensor *input_bboxes, const ITensor *input_scores, ITensor *output_indices, unsigned int max_output_size,
- const float score_threshold, const float iou_threshold)
+void CPPNonMaximumSuppressionKernel::configure(const ITensor *input_bboxes, const ITensor *input_scores, ITensor *output_indices,
+ unsigned int max_output_size, const float score_threshold, const float iou_threshold)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input_bboxes, input_scores, output_indices);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_bboxes->info(), input_scores->info(), output_indices->info(), max_output_size, score_threshold, iou_threshold));
@@ -76,14 +74,6 @@ void CPPNonMaximumSuppressionKernel::configure(
_max_output_size = max_output_size;
_num_boxes = input_scores->info()->dimension(0);
- _scores_above_thd_vector.reserve(_num_boxes);
- _indices_above_thd_vector.reserve(_num_boxes);
-
- // Visited and sorted_indices are preallocated as num_boxes size, which is the maximum size possible
- // Will be used only N elements where N is the number of score above the threshold
- _visited.reserve(_num_boxes);
- _sorted_indices.reserve(_num_boxes);
-
// Configure kernel window
Window win = calculate_max_window(*output_indices->info(), Steps());
@@ -91,9 +81,8 @@ void CPPNonMaximumSuppressionKernel::configure(
ICPPKernel::configure(win);
}
-Status CPPNonMaximumSuppressionKernel::validate(
- const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *output_indices, unsigned int max_output_size,
- const float score_threshold, const float iou_threshold)
+Status CPPNonMaximumSuppressionKernel::validate(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *output_indices,
+ unsigned int max_output_size, const float score_threshold, const float iou_threshold)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(bboxes, scores, output_indices, max_output_size, score_threshold, iou_threshold));
return Status{};
@@ -106,33 +95,37 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
- unsigned int num_above_thd = 0;
+ // Auxiliary tensors
+ std::vector<int> indices_above_thd;
+ std::vector<float> scores_above_thd;
for(unsigned int i = 0; i < _num_boxes; ++i)
{
const float score_i = *(reinterpret_cast<float *>(_input_scores->ptr_to_element(Coordinates(i))));
if(score_i >= _score_threshold)
{
- _indices_above_thd_vector.emplace_back(i);
- _scores_above_thd_vector.emplace_back(score_i);
- // Initialize respective index and visited
- _sorted_indices.emplace_back(num_above_thd);
- _visited.push_back(false);
- ++num_above_thd;
+ scores_above_thd.emplace_back(score_i);
+ indices_above_thd.emplace_back(i);
}
}
// Sort selected indices based on scores
- std::sort(_sorted_indices.begin(),
- _sorted_indices.end(),
+ const unsigned int num_above_thd = indices_above_thd.size();
+ std::vector<unsigned int> sorted_indices;
+ sorted_indices.resize(num_above_thd);
+ std::iota(sorted_indices.data(), sorted_indices.data() + num_above_thd, 0);
+ std::sort(std::begin(sorted_indices),
+ std::end(sorted_indices),
[&](unsigned int first, unsigned int second)
{
- return _scores_above_thd_vector[first] > _scores_above_thd_vector[second];
+ return scores_above_thd[first] > scores_above_thd[second];
});
// Number of output is the minimum between max_detection and the scores above the threshold
const unsigned int num_output = std::min(_max_output_size, num_above_thd);
unsigned int output_idx = 0;
+ std::vector<bool> visited(num_above_thd, false);
+ // Keep only boxes with small IoU
for(unsigned int i = 0; i < num_above_thd; ++i)
{
// Check if the output is full
@@ -142,9 +135,10 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
}
// Check if it was already visited, if not add it to the output and update the indices counter
- if(!_visited[_sorted_indices[i]])
+ if(!visited[sorted_indices[i]])
{
- *(reinterpret_cast<int *>(_output_indices->ptr_to_element(Coordinates(output_idx)))) = _indices_above_thd_vector[_sorted_indices[i]];
+ *(reinterpret_cast<int *>(_output_indices->ptr_to_element(Coordinates(output_idx)))) = indices_above_thd[sorted_indices[i]];
+ visited[sorted_indices[i]] = true;
++output_idx;
}
else
@@ -155,11 +149,11 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
// Once added one element at the output check if the next ones overlap and can be skipped
for(unsigned int j = i + 1; j < num_above_thd; ++j)
{
- if(!_visited[_sorted_indices[j]])
+ if(!visited[sorted_indices[j]])
{
// Calculate IoU
- const unsigned int i_index = _indices_above_thd_vector[_sorted_indices[i]];
- const unsigned int j_index = _indices_above_thd_vector[_sorted_indices[j]];
+ const unsigned int i_index = indices_above_thd[sorted_indices[i]];
+ const unsigned int j_index = indices_above_thd[sorted_indices[j]];
// Box-corner format: xmin, ymin, xmax, ymax
const auto box_i_xmin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, i_index))));
const auto box_i_ymin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, i_index))));
@@ -190,7 +184,7 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
if(overlap > _iou_threshold)
{
- _visited[_sorted_indices[j]] = true;
+ visited[sorted_indices[j]] = true;
}
}
}
diff --git a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
index 2997b593c6..7b4f7b97c4 100644
--- a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
@@ -156,7 +156,7 @@ void SaveOutputs(const Tensor *decoded_boxes, const std::vector<int> &result_idx
std::vector<unsigned int> &sorted_indices, const unsigned int num_output, const unsigned int max_detections, ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores,
ITensor *num_detection)
{
- // ymin,xmin,ymax,xmax -> xmin,ymin,xmax,ymax
+ // xmin,ymin,xmax,ymax -> ymin,xmin,ymax,xmax
unsigned int i = 0;
for(; i < num_output; ++i)
{
@@ -184,7 +184,7 @@ void SaveOutputs(const Tensor *decoded_boxes, const std::vector<int> &result_idx
CPPDetectionPostProcessLayer::CPPDetectionPostProcessLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _nms(), _input_box_encoding(nullptr), _input_scores(nullptr), _input_anchors(nullptr), _output_boxes(nullptr), _output_classes(nullptr),
_output_scores(nullptr), _num_detection(nullptr), _info(), _num_boxes(), _num_classes_with_background(), _num_max_detected_boxes(), _decoded_boxes(), _decoded_scores(), _selected_indices(),
- _class_scores(), _input_scores_to_use(nullptr), _result_idx_boxes_after_nms(), _result_classes_after_nms(), _result_scores_after_nms(), _sorted_indices(), _box_scores()
+ _class_scores(), _input_scores_to_use(nullptr)
{
}
@@ -217,8 +217,7 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding,
auto_init_if_empty(*_decoded_boxes.info(), TensorInfo(TensorShape(_kNumCoordBox, _input_box_encoding->info()->dimension(1), _kBatchSize), 1, DataType::F32));
auto_init_if_empty(*_decoded_scores.info(), TensorInfo(TensorShape(_input_scores->info()->dimension(0), _input_scores->info()->dimension(1), _kBatchSize), 1, DataType::F32));
- auto_init_if_empty(*_selected_indices.info(), TensorInfo(TensorShape(info.max_detections()), 1, DataType::S32));
-
+ auto_init_if_empty(*_selected_indices.info(), TensorInfo(TensorShape(info.use_regular_nms() ? info.detection_per_class() : info.max_detections()), 1, DataType::S32));
const unsigned int num_classes_per_box = std::min(info.max_classes_per_detection(), info.num_classes());
auto_init_if_empty(*_class_scores.info(), TensorInfo(info.use_regular_nms() ? TensorShape(_num_boxes) : TensorShape(_num_boxes * num_classes_per_box), 1, DataType::F32));
@@ -236,21 +235,6 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding,
_decoded_scores.allocator()->allocate();
_selected_indices.allocator()->allocate();
_class_scores.allocator()->allocate();
-
- if(info.use_regular_nms())
- {
- _result_idx_boxes_after_nms.reserve(_info.detection_per_class() * _info.num_classes());
- _result_classes_after_nms.reserve(_info.detection_per_class() * _info.num_classes());
- _result_scores_after_nms.reserve(_info.detection_per_class() * _info.num_classes());
- }
- else
- {
- _result_scores_after_nms.reserve(num_classes_per_box * _num_boxes);
- _result_classes_after_nms.reserve(num_classes_per_box * _num_boxes);
- _result_scores_after_nms.reserve(num_classes_per_box * _num_boxes);
- _box_scores.reserve(_num_boxes);
- }
- _sorted_indices.resize(info.use_regular_nms() ? info.max_detections() : info.num_classes());
}
Status CPPDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors,
@@ -288,9 +272,15 @@ void CPPDetectionPostProcessLayer::run()
}
}
}
+
// Regular NMS
if(_info.use_regular_nms())
{
+ std::vector<int> result_idx_boxes_after_nms;
+ std::vector<int> result_classes_after_nms;
+ std::vector<float> result_scores_after_nms;
+ std::vector<unsigned int> sorted_indices;
+
for(unsigned int c = 0; c < num_classes; ++c)
{
// For each boxes get scores of the boxes for the class c
@@ -299,6 +289,8 @@ void CPPDetectionPostProcessLayer::run()
*(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(i)))) =
*(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, i)))); // i * _num_classes_with_background + c + 1
}
+
+ // Run Non-maxima Suppression
_nms.run();
for(unsigned int i = 0; i < _info.detection_per_class(); ++i)
@@ -307,67 +299,74 @@ void CPPDetectionPostProcessLayer::run()
if(selected_index == -1)
{
// Nms will return -1 for all the last M-elements not valid
- continue;
+ break;
}
- _result_idx_boxes_after_nms.emplace_back(selected_index);
- _result_scores_after_nms.emplace_back((reinterpret_cast<float *>(_class_scores.buffer()))[selected_index]);
- _result_classes_after_nms.emplace_back(c);
+ result_idx_boxes_after_nms.emplace_back(selected_index);
+ result_scores_after_nms.emplace_back((reinterpret_cast<float *>(_class_scores.buffer()))[selected_index]);
+ result_classes_after_nms.emplace_back(c);
}
}
// We select the max detection numbers of the highest score of all classes
- const auto num_selected = _result_idx_boxes_after_nms.size();
+ const auto num_selected = result_scores_after_nms.size();
const auto num_output = std::min<unsigned int>(max_detections, num_selected);
// Sort selected indices based on result scores
- std::iota(_sorted_indices.begin(), _sorted_indices.end(), 0);
- std::partial_sort(_sorted_indices.data(),
- _sorted_indices.data() + num_output,
- _sorted_indices.data() + num_selected,
+ sorted_indices.resize(num_selected);
+ std::iota(sorted_indices.begin(), sorted_indices.end(), 0);
+ std::partial_sort(sorted_indices.data(),
+ sorted_indices.data() + num_output,
+ sorted_indices.data() + num_selected,
[&](unsigned int first, unsigned int second)
{
- return _result_scores_after_nms[first] > _result_scores_after_nms[second];
+ return result_scores_after_nms[first] > result_scores_after_nms[second];
});
- SaveOutputs(&_decoded_boxes, _result_idx_boxes_after_nms, _result_scores_after_nms, _result_classes_after_nms,
- _sorted_indices, num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection);
+ SaveOutputs(&_decoded_boxes, result_idx_boxes_after_nms, result_scores_after_nms, result_classes_after_nms, sorted_indices,
+ num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection);
}
// Fast NMS
else
{
const unsigned int num_classes_per_box = std::min<unsigned int>(_info.max_classes_per_detection(), _info.num_classes());
- for(unsigned int b = 0, index = 0; b < _num_boxes; ++b)
+ std::vector<float> max_scores;
+ std::vector<int> box_indices;
+ std::vector<int> max_score_classes;
+
+ for(unsigned int b = 0; b < _num_boxes; ++b)
{
- _box_scores.clear();
- _sorted_indices.clear();
+ std::vector<float> box_scores;
for(unsigned int c = 0; c < num_classes; ++c)
{
- _box_scores.emplace_back(*(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, b)))));
- _sorted_indices.push_back(c);
+ box_scores.emplace_back(*(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, b)))));
}
- std::partial_sort(_sorted_indices.data(),
- _sorted_indices.data() + num_classes_per_box,
- _sorted_indices.data() + num_classes,
+
+ std::vector<unsigned int> max_score_indices;
+ max_score_indices.resize(_info.num_classes());
+ std::iota(max_score_indices.data(), max_score_indices.data() + _info.num_classes(), 0);
+ std::partial_sort(max_score_indices.data(),
+ max_score_indices.data() + num_classes_per_box,
+ max_score_indices.data() + num_classes,
[&](unsigned int first, unsigned int second)
{
- return _box_scores[first] > _box_scores[second];
+ return box_scores[first] > box_scores[second];
});
- for(unsigned int i = 0; i < num_classes_per_box; ++i, ++index)
+ for(unsigned int i = 0; i < num_classes_per_box; ++i)
{
- const float score_to_add = _box_scores[_sorted_indices[i]];
- *(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(index)))) = score_to_add;
- _result_scores_after_nms.emplace_back(score_to_add);
- _result_idx_boxes_after_nms.emplace_back(b);
- _result_classes_after_nms.emplace_back(_sorted_indices[i]);
+ const float score_to_add = box_scores[max_score_indices[i]];
+ *(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(b * num_classes_per_box + i)))) = score_to_add;
+ max_scores.emplace_back(score_to_add);
+ box_indices.emplace_back(b);
+ max_score_classes.emplace_back(max_score_indices[i]);
}
}
- // Run NMS
+ // Run Non-maxima Suppression
_nms.run();
- _sorted_indices.clear();
+ std::vector<unsigned int> selected_indices;
for(unsigned int i = 0; i < max_detections; ++i)
{
// NMS returns M valid indices, the not valid tail is filled with -1
@@ -376,13 +375,13 @@ void CPPDetectionPostProcessLayer::run()
// Nms will return -1 for all the last M-elements not valid
break;
}
- _sorted_indices.emplace_back(*(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i)))));
+ selected_indices.emplace_back(*(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i)))));
}
// We select the max detection numbers of the highest score of all classes
- const auto num_output = std::min<unsigned int>(_info.max_detections(), _sorted_indices.size());
+ const auto num_output = std::min<unsigned int>(_info.max_detections(), selected_indices.size());
- SaveOutputs(&_decoded_boxes, _result_idx_boxes_after_nms, _result_scores_after_nms, _result_classes_after_nms,
- _sorted_indices, num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection);
+ SaveOutputs(&_decoded_boxes, box_indices, max_scores, max_score_classes, selected_indices,
+ num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection);
}
}
} // namespace arm_compute \ No newline at end of file