From 883bad7ef34c3429b3338d5544a8cdf3b60cd1e8 Mon Sep 17 00:00:00 2001
From: Isabella Gottardi <isabella.gottardi@arm.com>
Date: Mon, 15 Jul 2019 17:33:07 +0100
Subject: COMPMID-1849: Add DetectorPostProcess operator Part1 - Rework of
 CPPNonMaximumSuppression

Change-Id: I2b34fbd12188db49b0ac050a12312494eeefd819
Signed-off-by: Isabella Gottardi <isabella.gottardi@arm.com>
Reviewed-on: https://review.mlplatform.org/c/1585
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
---
 arm_compute/core/CPP/CPPKernels.h                  |   1 +
 .../CPP/kernels/CPPNonMaximumSuppressionKernel.h   | 102 ++++++++++
 arm_compute/runtime/CPP/CPPFunctions.h             |   1 +
 .../CPP/functions/CPPDetectionOutputLayer.h        |  50 -----
 .../CPP/functions/CPPNonMaximumSuppression.h       |  67 +++++++
 .../CPP/kernels/CPPNonMaximumSuppressionKernel.cpp | 205 +++++++++++++++++++++
 .../CPP/functions/CPPDetectionOutputLayer.cpp      |  95 +---------
 .../CPP/functions/CPPNonMaximumSuppression.cpp     |  46 +++++
 tests/SConscript                                   |   2 +-
 tests/validation/CPP/NonMaximalSuppression.cpp     | 144 ---------------
 tests/validation/CPP/NonMaximumSuppression.cpp     | 144 +++++++++++++++
 tests/validation/reference/NonMaxSuppression.cpp   |  14 +-
 12 files changed, 580 insertions(+), 291 deletions(-)
 create mode 100644 arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h
 create mode 100644 arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h
 create mode 100644 src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
 create mode 100644 src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp
 delete mode 100644 tests/validation/CPP/NonMaximalSuppression.cpp
 create mode 100644 tests/validation/CPP/NonMaximumSuppression.cpp

diff --git a/arm_compute/core/CPP/CPPKernels.h b/arm_compute/core/CPP/CPPKernels.h
index 70d858220f..30cbf5d52c 100644
--- a/arm_compute/core/CPP/CPPKernels.h
+++ b/arm_compute/core/CPP/CPPKernels.h
@@ -29,6 +29,7 @@
 #include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h"
 #include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
 #include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h"
+#include "arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h"
 #include "arm_compute/core/CPP/kernels/CPPPermuteKernel.h"
 #include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h"
 #include "arm_compute/core/CPP/kernels/CPPTopKVKernel.h"
diff --git a/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h b/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h
new file mode 100644
index 0000000000..dd035d5cb1
--- /dev/null
+++ b/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSIONKERNEL_LAYER_H__
+#define __ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSIONKERNEL_LAYER_H__
+
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** CPP kernel to perform non maximum suppression on the bounding boxes and scores
+ * Writes the indices of the kept boxes, best score first, and sets the remaining entries up to max_output_size to -1.
+ */
+class CPPNonMaximumSuppressionKernel : public ICPPKernel
+{
+public:
+    const char *name() const override
+    {
+        return "CPPNonMaximumSuppressionKernel";
+    }
+    /** Default constructor */
+    CPPNonMaximumSuppressionKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CPPNonMaximumSuppressionKernel(const CPPNonMaximumSuppressionKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CPPNonMaximumSuppressionKernel &operator=(const CPPNonMaximumSuppressionKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CPPNonMaximumSuppressionKernel(CPPNonMaximumSuppressionKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CPPNonMaximumSuppressionKernel &operator=(CPPNonMaximumSuppressionKernel &&) = default;
+    /** Default destructor */
+    ~CPPNonMaximumSuppressionKernel() = default;
+
+    /** Configure the kernel to perform non maximal suppression
+     *
+     * @param[in]  input_bboxes    The input bounding boxes, read as [4, num_boxes] in box-corner format (xmin, ymin, xmax, ymax). Data types supported: F32.
+     * @param[in]  input_scores    The corresponding input confidence, one score per box. Data types supported: same as @p input_bboxes.
+     * @param[out] output_indices  The kept indices of bboxes after nms, unused entries are set to -1. Data types supported: S32.
+     * @param[in]  max_output_size The maximum number of boxes to be selected by non max suppression. Must not be 0.
+     * @param[in]  score_threshold The threshold used to filter detection results. Must be in [0,1].
+     * @param[in]  iou_threshold   The threshold used in non maximum suppression. Must be in [0,1].
+     *
+     */
+    void configure(const ITensor *input_bboxes, const ITensor *input_scores, ITensor *output_indices, unsigned int max_output_size, const float score_threshold, const float iou_threshold);
+
+    /** Static function to check if given arguments will lead to a valid configuration of @ref CPPNonMaximumSuppressionKernel
+     *
+     * @param[in] input_bboxes    The input bounding boxes tensor info. Data types supported: F32.
+     * @param[in] input_scores    The corresponding input confidence tensor info. Data types supported: same as @p input_bboxes.
+     * @param[in] output_indices  The kept indices of bboxes after nms tensor info. Data types supported: S32.
+     * @param[in] max_output_size The maximum number of boxes to be selected by non max suppression. Must not be 0.
+     * @param[in] score_threshold The threshold used to filter detection results. Must be in [0,1].
+     * @param[in] iou_threshold   The threshold used in non maximum suppression. Must be in [0,1].
+     *
+     */
+    static Status validate(const ITensorInfo *input_bboxes, const ITensorInfo *input_scores, const ITensorInfo *output_indices, unsigned int max_output_size,
+                           const float score_threshold, const float iou_threshold);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input_bboxes;    /**< Input bounding boxes (not owned) */
+    const ITensor *_input_scores;    /**< Input scores (not owned) */
+    ITensor       *_output_indices;  /**< Output indices of the kept boxes (not owned) */
+    unsigned int   _max_output_size; /**< Maximum number of boxes to select */
+    float          _score_threshold; /**< Boxes with a score below this value are ignored */
+    float          _iou_threshold;   /**< A box overlapping a kept box by more than this value is suppressed */
+
+    unsigned int _num_boxes; /**< Number of boxes, taken from dimension 0 of the scores tensor */
+
+    std::vector<float>        _scores_above_thd_vector;  /**< Scores that reached the score threshold */
+    std::vector<int>          _indices_above_thd_vector; /**< Box index of each entry of _scores_above_thd_vector */
+    std::vector<bool>         _visited;                  /**< Set to true for the candidates that have been suppressed */
+    std::vector<unsigned int> _sorted_indices;           /**< Positions into the vectors above, sorted by decreasing score */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSIONKERNEL_LAYER_H__ */
diff --git a/arm_compute/runtime/CPP/CPPFunctions.h b/arm_compute/runtime/CPP/CPPFunctions.h
index 4bb668fc83..1dff03f349 100644
--- a/arm_compute/runtime/CPP/CPPFunctions.h
+++ b/arm_compute/runtime/CPP/CPPFunctions.h
@@ -27,6 +27,7 @@
 /* Header regrouping all the CPP functions */
 #include "arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h"
 #include "arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h"
+#include "arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h"
 #include "arm_compute/runtime/CPP/functions/CPPPermute.h"
 #include "arm_compute/runtime/CPP/functions/CPPTopKV.h"
 #include "arm_compute/runtime/CPP/functions/CPPUpsample.h"
diff --git a/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h b/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h
index 8c610f3ec2..71be8a0ad8 100644
--- a/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h
+++ b/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h
@@ -39,56 +39,6 @@ using NormalizedBBox = std::array<float, 4>;
 // LabelBBox used for map label and bounding box
 using LabelBBox = std::map<int, std::vector<NormalizedBBox>>;
 
-/** CPP Function to perform non maximum suppression on the bounding boxes and scores
- *
- */
-class CPPNonMaximumSuppression : public IFunction
-{
-public:
-    /** Default constructor */
-    CPPNonMaximumSuppression();
-    /** Configure the function to perform non maximal suppression
-     *
-     * @param[in]  bboxes          The input bounding boxes. Data types supported: F32.
-     * @param[in]  scores          The corresponding input confidence. Same as @p scores.
-     * @param[out] indices         The kept indices of bboxes after nms. Data types supported: S32.
-     * @param[in]  max_output_size An integer tensor representing the maximum number of boxes to be selected by non max suppression.
-     * @param[in]  score_threshold The threshold used to filter detection results.
-     * @param[in]  nms_threshold   The threshold used in non maximum suppression.
-     *
-     */
-    void configure(const ITensor *bboxes, const ITensor *scores, ITensor *indices, unsigned int max_output_size, const float score_threshold, const float nms_threshold);
-
-    /** Static function to check if given arguments will lead to a valid configuration of @ref CPPNonMaximumSuppression
-     *
-     * @param[in]  bboxes          The input bounding boxes. Data types supported: F32.
-     * @param[in]  scores          The corresponding input confidence. Same as @p scores.
-     * @param[out] indices         The kept indices of bboxes after nms. Data types supported: S32.
-     * @param[in]  max_output_size An integer tensor representing the maximum number of boxes to be selected by non max suppression.
-     * @param[in]  score_threshold The threshold used to filter detection results.
-     * @param[in]  nms_threshold   The threshold used in non maximum suppression.
-     *
-     */
-    static Status validate(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size,
-                           const float score_threshold, const float nms_threshold);
-
-    // Inherited methods overridden:
-    void run() override;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CPPNonMaximumSuppression(const CPPNonMaximumSuppression &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CPPNonMaximumSuppression &operator=(const CPPNonMaximumSuppression &) = delete;
-
-private:
-    const ITensor *_bboxes;
-    const ITensor *_scores;
-    ITensor       *_indices;
-    unsigned int   _max_output_size;
-
-    float _score_threshold;
-    float _nms_threshold;
-};
-
 /** CPP Function to generate the detection output based on location and confidence
  * predictions by doing non maximum suppression.
  *
diff --git a/arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h b/arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h
new file mode 100644
index 0000000000..dfb3a81b7f
--- /dev/null
+++ b/arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSION_LAYER_H__
+#define __ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSION_LAYER_H__
+
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** CPP Function to perform non maximum suppression on the bounding boxes and scores
+ *
+ */
+class CPPNonMaximumSuppression : public ICPPSimpleFunction
+{
+public:
+    /** Configure the function to perform non maximal suppression
+     *
+     * @param[in]  bboxes          The input bounding boxes. Data types supported: F32.
+     * @param[in]  scores          The corresponding input confidence. Data types supported: same as @p bboxes.
+     * @param[out] indices         The kept indices of bboxes after nms. Data types supported: S32.
+     * @param[in]  max_output_size The maximum number of boxes to be selected by non max suppression.
+     * @param[in]  score_threshold The threshold used to filter detection results.
+     * @param[in]  nms_threshold   The threshold used in non maximum suppression.
+     *
+     */
+    void configure(const ITensor *bboxes, const ITensor *scores, ITensor *indices, unsigned int max_output_size, const float score_threshold, const float nms_threshold);
+
+    /** Static function to check if given arguments will lead to a valid configuration of @ref CPPNonMaximumSuppression
+     *
+     * @param[in] bboxes          The input bounding boxes tensor info. Data types supported: F32.
+     * @param[in] scores          The corresponding input confidence tensor info. Data types supported: same as @p bboxes.
+     * @param[in] indices         The kept indices of bboxes after nms tensor info. Data types supported: S32.
+     * @param[in] max_output_size The maximum number of boxes to be selected by non max suppression.
+     * @param[in] score_threshold The threshold used to filter detection results.
+     * @param[in] nms_threshold   The threshold used in non maximum suppression.
+     *
+     */
+    static Status validate(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size,
+                           const float score_threshold, const float nms_threshold);
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSION_LAYER_H__ */
diff --git a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
new file mode 100644
index 0000000000..fb38bdcf94
--- /dev/null
+++ b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
+
+#include <list>
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *output_indices, unsigned int max_output_size,
+                          const float score_threshold, const float iou_threshold)
+{ // Argument checks shared by configure() and validate(); the order of the checks decides which error is reported
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(bboxes, scores, output_indices);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bboxes, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_indices, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(bboxes->num_dimensions() > 2, "The bboxes tensor must be a 2-D float tensor of shape [4, num_boxes]."); // NOTE: only the rank is bounded, dimension 0 is not required to be 4
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1, "The scores tensor must be a 1-D float tensor of shape [num_boxes]."); // NOTE: num_boxes is not compared with the bboxes shape
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->num_dimensions() > 1, "The indices must be 1-D integer tensor of shape [M], where max_output_size <= M"); // NOTE: max_output_size <= M itself is not checked
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bboxes, scores); // Together with the F32 check on bboxes this requires scores to be F32
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->dimension(0) == 0, "Indices tensor must be bigger than 0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(max_output_size == 0, "Max size cannot be 0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(iou_threshold < 0.f || iou_threshold > 1.f, "IOU threshold must be in [0,1]");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(score_threshold < 0.f || score_threshold > 1.f, "Score threshold must be in [0,1]");
+
+    return Status{};
+}
+} // namespace
+
+CPPNonMaximumSuppressionKernel::CPPNonMaximumSuppressionKernel()
+    : _input_bboxes(nullptr), _input_scores(nullptr), _output_indices(nullptr), _max_output_size(0), _score_threshold(0.f), _iou_threshold(0.f), _num_boxes(0), _scores_above_thd_vector(),
+      _indices_above_thd_vector(), _visited(), _sorted_indices()
+{ // Null tensors, zeroed parameters and empty scratch vectors: configure() sets the real values
+}
+
+void CPPNonMaximumSuppressionKernel::configure(
+    const ITensor *input_bboxes, const ITensor *input_scores, ITensor *output_indices, unsigned int max_output_size,
+    const float score_threshold, const float iou_threshold)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input_bboxes, input_scores, output_indices);
+    // Auto-initialise an empty output before validating it; the indices are S32, which is what validate_arguments() requires
+    auto_init_if_empty(*output_indices->info(), TensorShape(max_output_size), 1, DataType::S32, QuantizationInfo());
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_bboxes->info(), input_scores->info(), output_indices->info(), max_output_size, score_threshold, iou_threshold));
+
+    _input_bboxes    = input_bboxes;
+    _input_scores    = input_scores;
+    _output_indices  = output_indices;
+    _score_threshold = score_threshold;
+    _iou_threshold   = iou_threshold;
+    _max_output_size = max_output_size;
+    _num_boxes       = input_scores->info()->dimension(0);
+
+    // At most num_boxes scores can reach the threshold, so reserve that much once here
+    _scores_above_thd_vector.reserve(_num_boxes);
+    _indices_above_thd_vector.reserve(_num_boxes);
+
+    // Visited and sorted_indices are preallocated as num_boxes size, which is the maximum size possible
+    // Will be used only N elements where N is the number of score above the threshold
+    _visited.reserve(_num_boxes);
+    _sorted_indices.reserve(_num_boxes);
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output_indices->info(), Steps());
+
+    // The CPPNonMaximumSuppressionKernel doesn't need padding so update_window_and_padding() can be skipped
+    ICPPKernel::configure(win);
+}
+
+Status CPPNonMaximumSuppressionKernel::validate(
+    const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *output_indices, unsigned int max_output_size,
+    const float score_threshold, const float iou_threshold)
+{
+    // Every check lives in validate_arguments(): an error is reported as is, success is its empty Status
+    return validate_arguments(bboxes, scores, output_indices, max_output_size, score_threshold, iou_threshold);
+}
+
+void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_UNUSED(window);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
+
+    // The scratch vectors are members and survive between calls: empty them so that a second run() starts clean
+    _scores_above_thd_vector.clear();
+    _indices_above_thd_vector.clear();
+    _visited.clear();
+    _sorted_indices.clear();
+
+    unsigned int num_above_thd = 0;
+    for(unsigned int i = 0; i < _num_boxes; ++i)
+    {
+        const float score_i = *(reinterpret_cast<float *>(_input_scores->ptr_to_element(Coordinates(i))));
+        if(score_i >= _score_threshold)
+        {
+            _indices_above_thd_vector.emplace_back(i);
+            _scores_above_thd_vector.emplace_back(score_i);
+            // Initialize respective index and visited
+            _sorted_indices.emplace_back(num_above_thd);
+            _visited.emplace_back(false);
+            ++num_above_thd;
+        }
+    }
+
+    // Sort selected indices based on scores
+    std::sort(_sorted_indices.begin(),
+              _sorted_indices.end(),
+              [&](unsigned int first, unsigned int second)
+    {
+        return _scores_above_thd_vector[first] > _scores_above_thd_vector[second];
+    });
+
+    // Number of output is the minimum between max_detection and the scores above the threshold
+    const unsigned int num_output = std::min(_max_output_size, num_above_thd);
+    unsigned int       output_idx = 0;
+
+    // Walk the candidates from best to worst score and stop as soon as the output is full
+    for(unsigned int i = 0; i < num_above_thd && output_idx < num_output; ++i)
+    {
+        // A candidate already suppressed by a better scoring box is skipped
+        if(_visited[_sorted_indices[i]])
+        {
+            continue;
+        }
+
+        // Keep candidate i: store the index of its box in the input tensors
+        const unsigned int i_index = _indices_above_thd_vector[_sorted_indices[i]];
+        *(reinterpret_cast<int *>(_output_indices->ptr_to_element(Coordinates(output_idx)))) = _indices_above_thd_vector[_sorted_indices[i]];
+        ++output_idx;
+
+        // Box i does not depend on j, so it is read once here. Box-corner format: xmin, ymin, xmax, ymax
+        const auto  box_i_xmin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, i_index))));
+        const auto  box_i_ymin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, i_index))));
+        const auto  box_i_xmax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, i_index))));
+        const auto  box_i_ymax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, i_index))));
+        const float area_i     = (box_i_xmax - box_i_xmin) * (box_i_ymax - box_i_ymin);
+
+        // Once added one element at the output check if the next ones overlap and can be skipped
+        for(unsigned int j = i + 1; j < num_above_thd; ++j)
+        {
+            if(!_visited[_sorted_indices[j]])
+            {
+                // Calculate IoU
+                const unsigned int j_index = _indices_above_thd_vector[_sorted_indices[j]];
+                const auto  box_j_xmin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, j_index))));
+                const auto  box_j_ymin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, j_index))));
+                const auto  box_j_xmax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, j_index))));
+                const auto  box_j_ymax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, j_index))));
+                const float area_j     = (box_j_xmax - box_j_xmin) * (box_j_ymax - box_j_ymin);
+
+                float overlap;
+                if(area_i <= 0 || area_j <= 0)
+                {
+                    overlap = 0.0f;
+                }
+                else
+                {
+                    const auto y_min_intersection = std::max<float>(box_i_ymin, box_j_ymin);
+                    const auto x_min_intersection = std::max<float>(box_i_xmin, box_j_xmin);
+                    const auto y_max_intersection = std::min<float>(box_i_ymax, box_j_ymax);
+                    const auto x_max_intersection = std::min<float>(box_i_xmax, box_j_xmax);
+                    const auto area_intersection  = std::max<float>(y_max_intersection - y_min_intersection, 0.0f) * std::max<float>(x_max_intersection - x_min_intersection, 0.0f);
+                    overlap                       = area_intersection / (area_i + area_j - area_intersection);
+                }
+
+                if(overlap > _iou_threshold)
+                {
+                    _visited[_sorted_indices[j]] = true;
+                }
+            }
+        }
+    }
+    // Fewer boxes than max_output_size may have been kept:
+    // the remaining output entries are marked as not valid with -1
+    for(; output_idx < _max_output_size; ++output_idx)
+    {
+        *(reinterpret_cast<int *>(_output_indices->ptr_to_element(Coordinates(output_idx)))) = -1;
+    }
+}
+} // namespace arm_compute
diff --git a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
index 9a141cb73a..a1f4e6e89c 100644
--- a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
@@ -34,7 +34,7 @@ namespace arm_compute
 {
 namespace
 {
-Status detection_layer_validate_arguments(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info)
+Status validate_arguments(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_loc, 1, DataType::F32);
@@ -380,97 +380,8 @@ void ApplyNMSFast(const std::vector<NormalizedBBox> &bboxes,
         }
     }
 }
-
-Status non_max_suppression_validate_arguments(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size,
-                                              const float score_threshold, const float nms_threshold)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(bboxes, scores, indices);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bboxes, 1, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::S32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(bboxes->num_dimensions() > 2, "The bboxes tensor must be a 2-D float tensor of shape [4, num_boxes].");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1, "The scores tensor must be a 1-D float tensor of shape [num_boxes].");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(indices->num_dimensions() > 1, "The indices must be 1-D integer tensor of shape [M], where max_output_size <= M");
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bboxes, scores);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1, "Scores must be a 1D float tensor");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(indices->dimension(0) == 0, "Indices tensor must be bigger than 0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(max_output_size == 0, "Max size cannot be 0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(nms_threshold < 0.f || nms_threshold > 1.f, "Threshould must be in [0,1]");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(score_threshold < 0.f || score_threshold > 1.f, "Threshould must be in [0,1]");
-
-    return Status{};
-}
 } // namespace
 
-CPPNonMaximumSuppression::CPPNonMaximumSuppression()
-    : _bboxes(nullptr), _scores(nullptr), _indices(nullptr), _max_output_size(0), _score_threshold(0.f), _nms_threshold(0.f)
-{
-}
-
-void CPPNonMaximumSuppression::configure(
-    const ITensor *bboxes, const ITensor *scores, ITensor *indices, unsigned int max_output_size,
-    const float score_threshold, const float nms_threshold)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(bboxes, scores, indices);
-    ARM_COMPUTE_ERROR_THROW_ON(non_max_suppression_validate_arguments(bboxes->info(), scores->info(), indices->info(), max_output_size, score_threshold, nms_threshold));
-
-    // copy scores also to a vector
-    _bboxes  = bboxes;
-    _scores  = scores;
-    _indices = indices;
-
-    _nms_threshold   = nms_threshold;
-    _max_output_size = max_output_size;
-    _score_threshold = score_threshold;
-}
-
-Status CPPNonMaximumSuppression::validate(
-    const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size,
-    const float score_threshold, const float nms_threshold)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(non_max_suppression_validate_arguments(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold));
-    return Status{};
-}
-
-void extract_bounding_boxes_from_tensor(const ITensor *bboxes, std::vector<NormalizedBBox> &bboxes_vector)
-{
-    Window input_win;
-    input_win.use_tensor_dimensions(bboxes->info()->tensor_shape());
-    input_win.set_dimension_step(0U, 4U);
-    input_win.set_dimension_step(1U, 1U);
-    Iterator input(bboxes, input_win);
-    auto     f = [&bboxes_vector, &input](const Coordinates &)
-    {
-        const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
-        bboxes_vector.push_back(NormalizedBBox({ { *input_ptr, *(input_ptr + 1), *(2 + input_ptr), *(3 + input_ptr) } }));
-    };
-    execute_window_loop(input_win, f, input);
-}
-
-void extract_scores_from_tensor(const ITensor *scores, std::vector<float> &scores_vector)
-{
-    Window window;
-    window.use_tensor_dimensions(scores->info()->tensor_shape());
-    Iterator it(scores, window);
-    auto     f = [&it, &scores_vector](const Coordinates &)
-    {
-        const auto input_ptr = reinterpret_cast<const float *>(it.ptr());
-        scores_vector.push_back(*input_ptr);
-    };
-    execute_window_loop(window, f, it);
-}
-
-void CPPNonMaximumSuppression::run()
-{
-    std::vector<NormalizedBBox> bboxes_vector;
-    std::vector<float>          scores_vector;
-    std::vector<int>            indices_vector;
-    extract_bounding_boxes_from_tensor(_bboxes, bboxes_vector);
-    extract_scores_from_tensor(_scores, scores_vector);
-    ApplyNMSFast(bboxes_vector, scores_vector, _score_threshold, _nms_threshold, 1, -1 /* disable top_k */, indices_vector);
-    std::copy_n(indices_vector.begin(), std::min(indices_vector.size(), _indices->info()->dimension(0)), reinterpret_cast<int *>(_indices->ptr_to_element(Coordinates(0))));
-}
-
 CPPDetectionOutputLayer::CPPDetectionOutputLayer()
     : _input_loc(nullptr), _input_conf(nullptr), _input_priorbox(nullptr), _output(nullptr), _info(), _num_priors(), _num(), _all_location_predictions(), _all_confidence_scores(), _all_prior_bboxes(),
       _all_prior_variances(), _all_decode_bboxes(), _all_indices()
@@ -488,7 +399,7 @@ void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor
     auto_init_if_empty(*output->info(), input_loc->info()->clone()->set_tensor_shape(TensorShape(7U, max_size)));
 
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(detection_layer_validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info));
 
     _input_loc      = input_loc;
     _input_conf     = input_conf;
@@ -526,7 +437,7 @@ void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor
 
 Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(detection_layer_validate_arguments(input_loc, input_conf, input_priorbox, output, info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_loc, input_conf, input_priorbox, output, info));
     return Status{};
 }
 
diff --git a/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp
new file mode 100644
index 0000000000..f13674a42f
--- /dev/null
+++ b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h"
+
+#include "arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CPPNonMaximumSuppression::configure(
+    const ITensor *bboxes, const ITensor *scores, ITensor *indices, unsigned int max_output_size,
+    const float score_threshold, const float nms_threshold)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CPPNonMaximumSuppressionKernel>();
+    k->configure(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold);
+    _kernel = std::move(k);
+}
+
+Status CPPNonMaximumSuppression::validate(
+    const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size,
+    const float score_threshold, const float nms_threshold)
+{
+    return CPPNonMaximumSuppressionKernel::validate(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold);
+}
+} // namespace arm_compute
diff --git a/tests/SConscript b/tests/SConscript
index 103874f63f..c8de603541 100644
--- a/tests/SConscript
+++ b/tests/SConscript
@@ -32,7 +32,7 @@ SConscript('./framework/SConscript', duplicate=0)
 variables = [
     #FIXME: Remove before release!
     BoolVariable("benchmark_examples", "Build benchmark examples programs", True),
-    BoolVariable("validate_examples", "Build benchmark examples programs", True),
+    BoolVariable("validate_examples", "Build validate examples programs", True),
     #FIXME Switch the following two options to False before releasing
     BoolVariable("validation_tests", "Build validation test programs", True),
     BoolVariable("benchmark_tests", "Build benchmark test programs", True),
diff --git a/tests/validation/CPP/NonMaximalSuppression.cpp b/tests/validation/CPP/NonMaximalSuppression.cpp
deleted file mode 100644
index 6cd7b52066..0000000000
--- a/tests/validation/CPP/NonMaximalSuppression.cpp
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "tests/NEON/Accessor.h"
-#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ShapeDatasets.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/fixtures/NonMaxSuppressionFixture.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace
-{
-const auto max_output_boxes_dataset = framework::dataset::make("MaxOutputBoxes", 1, 10);
-const auto score_threshold_dataset  = framework::dataset::make("ScoreThreshold", { 0.1f, 0.5f, 0.f, 1.f });
-const auto nms_threshold_dataset    = framework::dataset::make("NMSThreshold", { 0.1f, 0.5f, 0.f, 1.f });
-const auto NMSParametersSmall       = datasets::Small2DNonMaxSuppressionShapes() * max_output_boxes_dataset * score_threshold_dataset * nms_threshold_dataset;
-const auto NMSParametersBig         = datasets::Large2DNonMaxSuppressionShapes() * max_output_boxes_dataset * score_threshold_dataset * nms_threshold_dataset;
-
-} // namespace
-
-TEST_SUITE(CPP)
-TEST_SUITE(NMS)
-
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
-                                                framework::dataset::make("BoundingBox",{
-                                                                                        TensorInfo(TensorShape(4U, 100U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(1U, 4U, 2U), 1, DataType::F32),    // invalid shape
-                                                                                        TensorInfo(TensorShape(4U, 2U), 1, DataType::S32),    // invalid data type
-                                                                                        TensorInfo(TensorShape(4U, 3U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(4U, 66U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(4U, 100U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(4U, 100U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(4U, 100U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(4U, 100U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(4U, 100U), 1, DataType::F32),
-                                                                                    }),
-                                                framework::dataset::make("Scores", {
-                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(37U, 2U, 13U, 27U), 1, DataType::F32), // invalid shape
-                                                                                        TensorInfo(TensorShape(4U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(3U), 1, DataType::U8),  // invalid data type
-                                                                                        TensorInfo(TensorShape(66U), 1, DataType::F32),  // invalid data type
-                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32),
-                                                                                    })),
-                                                framework::dataset::make("Indices", {
-                                                                                        TensorInfo(TensorShape(100U), 1, DataType::S32),
-                                                                                        TensorInfo(TensorShape(100U), 1, DataType::S32),
-                                                                                        TensorInfo(TensorShape(4U), 1, DataType::S32),
-                                                                                        TensorInfo(TensorShape(3U), 1, DataType::S32),
-                                                                                        TensorInfo(TensorShape(200U), 1, DataType::S32), // indices bigger than max bbs, OK because max_output is 66
-                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32), // invalid data type
-                                                                                        TensorInfo(TensorShape(100U), 1, DataType::S32),
-                                                                                        TensorInfo(TensorShape(100U), 1, DataType::S32),
-                                                                                        TensorInfo(TensorShape(100U), 1, DataType::S32),
-                                                                                        TensorInfo(TensorShape(100U), 1, DataType::S32),
-
-                                                                                    })),
-                                                framework::dataset::make("max_output", {
-                                                                                        10U, 2U,4U, 3U,66U, 1U,
-                                                                                        0U, /* invalid, must be greater than 0 */
-                                                                                        10000U, /* OK, clamped to indices' size */
-                                                                                        100U,
-                                                                                        10U,
-                                                                                     })),
-                                                framework::dataset::make("score_threshold", {
-                                                                                        0.1f, 0.4f, 0.2f,0.8f,0.3f, 0.01f, 0.5f, 0.45f,
-                                                                                        -1.f, /* invalid value, must be in [0,1] */
-                                                                                        0.5f,
-                                                                                     })),
-                                                framework::dataset::make("nms_threshold", {
-                                                                                        0.3f, 0.7f, 0.1f,0.13f,0.2f, 0.97f, 0.76f, 0.87f, 0.1f,
-                                                                                        10.f, /* invalid value, must be in [0,1]*/
-                                                                                     })),
-                                                framework::dataset::make("Expected", {
-                                                                                        true, false, false, false, true, false, false,true, false, false
-                                                                                     })),
-
-                                            bbox_info, scores_info, indices_info, max_out, score_threshold, nms_threshold, expected)
-{
-    ARM_COMPUTE_EXPECT(bool(CPPNonMaximumSuppression::validate(&bbox_info.clone()->set_is_resizable(false),
-                                                               &scores_info.clone()->set_is_resizable(false),
-                                                               &indices_info.clone()->set_is_resizable(false),
-                                max_out,score_threshold,nms_threshold)) == expected, framework::LogLevel::ERRORS);
-}
-// clang-format on
-// *INDENT-ON*
-
-using CPPNonMaxSuppressionFixture = NMSValidationFixture<Tensor, Accessor, CPPNonMaximumSuppression>;
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CPPNonMaxSuppressionFixture, framework::DatasetMode::PRECOMMIT, NMSParametersSmall)
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-
-FIXTURE_DATA_TEST_CASE(RunLarge, CPPNonMaxSuppressionFixture, framework::DatasetMode::NIGHTLY, NMSParametersBig)
-{
-    // Validate output
-    validate(Accessor(_target), _reference);
-}
-
-TEST_SUITE_END() // CPP
-TEST_SUITE_END() // NMS
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/CPP/NonMaximumSuppression.cpp b/tests/validation/CPP/NonMaximumSuppression.cpp
new file mode 100644
index 0000000000..bf24b2cf0c
--- /dev/null
+++ b/tests/validation/CPP/NonMaximumSuppression.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/NonMaxSuppressionFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+const auto max_output_boxes_dataset  = framework::dataset::make("MaxOutputBoxes", 1, 10);
+const auto score_threshold_dataset   = framework::dataset::make("ScoreThreshold", { 0.1f, 0.5f, 0.f, 1.f });
+const auto iou_nms_threshold_dataset = framework::dataset::make("NMSThreshold", { 0.1f, 0.5f, 0.f, 1.f });
+const auto NMSParametersSmall        = datasets::Small2DNonMaxSuppressionShapes() * max_output_boxes_dataset * score_threshold_dataset * iou_nms_threshold_dataset;
+const auto NMSParametersBig          = datasets::Large2DNonMaxSuppressionShapes() * max_output_boxes_dataset * score_threshold_dataset * iou_nms_threshold_dataset;
+
+} // namespace
+
+TEST_SUITE(CPP)
+TEST_SUITE(NMS)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
+                                                framework::dataset::make("BoundingBox",{
+                                                                                        TensorInfo(TensorShape(4U, 100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(1U, 4U, 2U), 1, DataType::F32),    // invalid shape
+                                                                                        TensorInfo(TensorShape(4U, 2U), 1, DataType::S32),    // invalid data type
+                                                                                        TensorInfo(TensorShape(4U, 3U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(4U, 66U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(4U, 100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(4U, 100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(4U, 100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(4U, 100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(4U, 100U), 1, DataType::F32),
+                                                                                    }),
+                                                framework::dataset::make("Scores", {
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(37U, 2U, 13U, 27U), 1, DataType::F32), // invalid shape
+                                                                                        TensorInfo(TensorShape(4U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(3U), 1, DataType::U8),  // invalid data type
+                                                                                        TensorInfo(TensorShape(66U), 1, DataType::F32),  // valid: one F32 score per bounding box (66)
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32),
+                                                                                    })),
+                                                framework::dataset::make("Indices", {
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::S32),
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::S32),
+                                                                                        TensorInfo(TensorShape(4U), 1, DataType::S32),
+                                                                                        TensorInfo(TensorShape(3U), 1, DataType::S32),
+                                                                                        TensorInfo(TensorShape(200U), 1, DataType::S32), // indices bigger than max bbs, OK because max_output is 66
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32), // invalid data type
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::S32),
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::S32),
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::S32),
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::S32),
+
+                                                                                    })),
+                                                framework::dataset::make("max_output", {
+                                                                                        10U, 2U,4U, 3U,66U, 1U,
+                                                                                        0U, /* invalid, must be greater than 0 */
+                                                                                        10000U, /* OK, clamped to indices' size */
+                                                                                        100U,
+                                                                                        10U,
+                                                                                     })),
+                                                framework::dataset::make("score_threshold", {
+                                                                                        0.1f, 0.4f, 0.2f,0.8f,0.3f, 0.01f, 0.5f, 0.45f,
+                                                                                        -1.f, /* invalid value, must be in [0,1] */
+                                                                                        0.5f,
+                                                                                     })),
+                                                framework::dataset::make("nms_threshold", {
+                                                                                        0.3f, 0.7f, 0.1f,0.13f,0.2f, 0.97f, 0.76f, 0.87f, 0.1f,
+                                                                                        10.f, /* invalid value, must be in [0,1]*/
+                                                                                     })),
+                                                framework::dataset::make("Expected", {
+                                                                                        true, false, false, false, true, false, false,true, false, false
+                                                                                     })),
+
+                                            bbox_info, scores_info, indices_info, max_out, score_threshold, nms_threshold, expected)
+{
+    ARM_COMPUTE_EXPECT(bool(CPPNonMaximumSuppression::validate(&bbox_info.clone()->set_is_resizable(false),
+                                                               &scores_info.clone()->set_is_resizable(false),
+                                                               &indices_info.clone()->set_is_resizable(false),
+                                max_out,score_threshold,nms_threshold)) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+using CPPNonMaxSuppressionFixture = NMSValidationFixture<Tensor, Accessor, CPPNonMaximumSuppression>;
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CPPNonMaxSuppressionFixture, framework::DatasetMode::PRECOMMIT, NMSParametersSmall)
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CPPNonMaxSuppressionFixture, framework::DatasetMode::NIGHTLY, NMSParametersBig)
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+
+TEST_SUITE_END() // NMS
+TEST_SUITE_END() // CPP
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/NonMaxSuppression.cpp b/tests/validation/reference/NonMaxSuppression.cpp
index 5b7980d2f0..8fc370b7af 100644
--- a/tests/validation/reference/NonMaxSuppression.cpp
+++ b/tests/validation/reference/NonMaxSuppression.cpp
@@ -76,10 +76,10 @@ inline float compute_size(const std::pair<float, float> &min, const std::pair<fl
 inline float compute_intersection(const std::pair<float, float> &b0_min, const std::pair<float, float> &b0_max,
                                   const std::pair<float, float> &b1_min, const std::pair<float, float> &b1_max, float b0_size, float b1_size)
 {
-    const float inter = std::max<float>(std::min<float>(b0_max.first, b1_max.first) - std::max<float>(b0_min.first, b1_min.first), 0.0) * std::max<float>(std::min<float>(b0_max.second,
+    const float inter = std::max<float>(std::min<float>(b0_max.first, b1_max.first) - std::max<float>(b0_min.first, b1_min.first), 0.0f) * std::max<float>(std::min<float>(b0_max.second,
                         b1_max.second)
                         - std::max<float>(b0_min.second, b1_min.second),
-                        0.0);
+                        0.0f);
     return inter / (b0_size + b1_size - inter);
 }
 
@@ -107,7 +107,7 @@ inline std::vector<CandidateBox> get_candidates(const SimpleTensor<float> &score
     std::vector<CandidateBox> candidates_vector;
     for(int i = 0; i < scores.num_elements(); ++i)
     {
-        if(scores[i] > threshold)
+        if(scores[i] >= threshold)
         {
             const auto cb = CandidateBox({ i, scores[i] });
             candidates_vector.push_back(cb);
@@ -115,7 +115,7 @@ inline std::vector<CandidateBox> get_candidates(const SimpleTensor<float> &score
     }
     std::stable_sort(candidates_vector.begin(), candidates_vector.end(), [](const CandidateBox bb0, const CandidateBox bb1)
     {
-        return bb0.second >= bb1.second;
+        return bb0.second > bb1.second;
     });
     return candidates_vector;
 }
@@ -155,6 +155,12 @@ SimpleTensor<int> non_max_suppression(const SimpleTensor<float> &bboxes, const S
         }
     }
     std::copy_n(selected.begin(), selected.size(), indices.data());
+
+    for(unsigned int i = selected.size(); i < max_output_size; ++i)
+    {
+        indices[i] = -1;
+    }
+
     return indices;
 }
 } // namespace reference
-- 
cgit v1.2.1